OSSEval / Code / [5350dd] /OSSEval/OSSEval/utils.py

[5350dd]: OSSEval / OSSEval / utils.py History

utils.py 165 lines (145 with data), 6.5 kB

# This Source Code Form of OSSEval is subject to the terms of the GNU AFFERO
# GENERAL PUBLIC LICENSE, v. 3.0. If a copy of the AGPL was not
# distributed with this file, You can obtain one at http://www.gnu.org/licenses/agpl.txt
#
# OSSeval is powered by the SOS Open Source AGPL edition.
#  The AGPL requires that you do not remove the SOS Open Source attribution and copyright 
#  notices from the user interface (see section 5.d below).

# OSSEval Copyright 2014 Bitergium SLL
# SOS Open Source Copyright 2012 Roberto Galoppini
# Author: Davide Galletti 


from json import JSONEncoder
from apiclient.discovery import build
from apiclient.errors import HttpError
import OSSEval
import urllib2
from lxml import html

class Configuration():
    '''
    Container for configuration values exposed as class attributes.
    '''
    # Google API developer key, copied from the OSSEval package when this
    # class body is executed (i.e. when the module is imported).
    google_developerKey = OSSEval.google_developerKey

class TrivialJSONEncoder(JSONEncoder):
    '''
    JSON encoder that serializes otherwise unsupported objects as their
    attribute dictionary (``__dict__``).
    '''
    def default(self, o):
        '''
        Return the attribute dictionary of ``o``.

        Objects that have no ``__dict__`` are handed to the base
        implementation, which raises the TypeError that callers of the
        json module expect for unserializable values.
        '''
        try:
            return o.__dict__
        except AttributeError:
            # Follow the JSONEncoder contract instead of leaking an
            # AttributeError out of json.dumps().
            return JSONEncoder.default(self, o)

class xmlMinidom():
    '''
    Best-effort accessors for DOM nodes: every helper returns a fallback
    value instead of raising when the requested data is not available.
    '''
    @staticmethod
    def getString(xmldoc, tag):
        '''
        Return the text of the first child of the first <tag> element found
        under xmldoc; "" if there is no such element or it has no text.
        '''
        try:
            return xmldoc.getElementsByTagName(tag)[0].firstChild.data
        except Exception:
            # Deliberately best-effort, but do not swallow
            # KeyboardInterrupt / SystemExit as a bare "except:" would.
            return ""

    @staticmethod
    def getStringAttribute(xmldoc, tag):
        '''
        Return the value of the attribute named tag on the node xmldoc;
        "" if the attribute cannot be read.
        '''
        try:
            return xmldoc.attributes[tag].firstChild.data
        except Exception:
            return ""

    @staticmethod
    def getNaturalAttribute(xmldoc, tag):
        '''
        Return the attribute named tag on the node xmldoc converted to int;
        None is returned if the attribute is missing or is not an integer.
        '''
        try:
            return int(xmldoc.attributes[tag].firstChild.data)
        except Exception:
            return None
        
class SearchEngine():
    '''
    Builds search-engine query URLs and retrieves result counts.

    Gigablast is the engine used by the generic entry points (search_url,
    search_count); Google helpers are provided alongside.
    In every method "sites" is an optional sequence of site names used to
    restrict the query; None or an empty sequence means no restriction.
    '''
    # Custom search engine id passed as "cx" to the Google Custom Search API.
    _GOOGLE_CSE_ID = '017576662512468239146:omuauf_lfve'

    # Fragments looked for in a Gigablast result page, e.g.
    #   "Results <b>1</b> to <b>10</b> of about <b>5,278,952</b>"
    #   "</form>No results found for <b>"
    _NO_RESULTS_MARKER = "</form>No results found for <b>"
    _COUNT_PREFIX = "</b> of about <b>"
    _COUNT_SUFFIX = "</b>"

    @staticmethod
    def search__engine_name():
        '''
        Name of the engine used by search_url / search_count.
        '''
        return "Gigablast"

    @staticmethod
    def search_url_parameters(search_text, sites=None):
        '''
        Return the "q=..." query string for search_text.

        Only ":" and "+" of search_text are percent-escaped here; spaces are
        left as they are. One site is appended as "+site%3A<site>", several
        sites as " (site%3A<a> OR site%3A<b> ...)".
        '''
        query = "q=" + search_text.replace(":", "%3A").replace("+", "%2B")
        if not sites:
            return query
        if len(sites) == 1:
            return query + "+site%3A" + sites[0]
        return query + " (" + " OR ".join("site%3A" + site for site in sites) + ")"

    @staticmethod
    def gigablast_search_url(search_text, sites=None):
        '''
        Return the Gigablast search URL for search_text restricted to sites.
        '''
        return "http://www.gigablast.com/search?" + SearchEngine.search_url_parameters(search_text, sites)

    @staticmethod
    def google_search_url(search_text, sites=None):
        '''
        Return the Google URL for search_text restricted to sites.
        '''
        # NOTE: an earlier revision built this URL with the fragment form
        # "https://www.google.com/?#q=..." instead of the query-string form.
        return "https://www.google.com/?" + SearchEngine.search_url_parameters(search_text, sites)

    @staticmethod
    def readable_query(search_text, sites=None):
        '''
        Return a human-readable version of the query: the %3A, %20 and %2B
        escapes in search_text are decoded and the site restrictions are
        appended as "site:<name>" terms.
        '''
        readable = search_text.replace("%3A", ":").replace("%20", " ").replace("%2B", "+")
        if not sites:
            return readable
        if len(sites) == 1:
            return readable + "+site:" + sites[0]
        return readable + "(" + " OR ".join("site:" + site for site in sites) + ")"

    @staticmethod
    def _parse_result_count(html_page):
        '''
        Extract the number of hits from the text of a Gigablast result page.

        Returns 0 when the page carries the "no results" marker, the count
        as an int when it can be located, and -1 when the page does not
        contain a parsable count.
        '''
        if SearchEngine._NO_RESULTS_MARKER in html_page:
            return 0
        start = html_page.find(SearchEngine._COUNT_PREFIX)
        if start < 0:
            return -1
        start += len(SearchEngine._COUNT_PREFIX)
        end = html_page.find(SearchEngine._COUNT_SUFFIX, start)
        if end < 0:
            return -1
        try:
            # The count is rendered with thousands separators: "5,278,952".
            return int(html_page[start:end].replace(",", ""))
        except ValueError:
            return -1

    @staticmethod
    def search_count_scraping(search_text, sites=None):
        '''
        Return the number of Gigablast results for the query, obtained by
        scraping the HTML result page.

        Returns an int: the count, 0 when there are no results, or -1 when
        the page cannot be fetched or no count can be read from it.
        '''
        url = SearchEngine.gigablast_search_url(search_text, sites).replace(" ", "%20")
        try:
            response = urllib2.urlopen(url)
            try:
                html_page = response.read()
            finally:
                response.close()
        except (urllib2.URLError, IOError):
            # urllib2 signals failures with URLError/HTTPError, not with the
            # apiclient HttpError used by the Google API code.
            return -1
        return SearchEngine._parse_result_count(html_page)

    @staticmethod
    def search_count(search_text, sites=None):
        '''
        Return the number of results for the query; delegates to
        search_count_scraping.
        '''
        return SearchEngine.search_count_scraping(search_text, sites)

    @staticmethod
    def search_url(search_text, sites=None):
        '''
        Return the search URL for the query; delegates to
        gigablast_search_url.
        '''
        return SearchEngine.gigablast_search_url(search_text, sites)

    @staticmethod
    def google_search_count(search_text, sites=None):
        '''
        Return the number of results reported by the Google Custom Search
        API; with several sites one request per site is made and the totals
        are summed. -1 is returned when the API raises HttpError.

        https://developers.google.com/custom-search/json-api/v1/reference/cse/list
        exactTerms        string     Identifies a phrase that all documents in the search results must contain.
        excludeTerms      string     Identifies a word or phrase that should not appear in any documents in the search results.
        siteSearch        string     Specifies all search results should be pages from a given site.
        siteSearchFilter  string     Controls whether to include or exclude results from the site named in the siteSearch parameter.
        Acceptable values are:            "e": exclude            "i": include
        '''
        try:
            service = build("customsearch", "v1", developerKey=OSSEval.google_developerKey)
            if not sites:
                res = service.cse().list(q=search_text, cx=SearchEngine._GOOGLE_CSE_ID).execute()
                return int(res['searchInformation']['totalResults'])
            total = 0
            for site in sites:
                res = service.cse().list(q=search_text, siteSearch=site, cx=SearchEngine._GOOGLE_CSE_ID).execute()
                total += int(res['searchInformation']['totalResults'])
            return total
        except HttpError:
            return -1

    @staticmethod
    def trends_img(project_name):
        '''
        Return the HTML <script> tag embedding the Google Trends time-series
        graph for project_name.
        '''
        # NOTE(review): project_name is inserted verbatim into the src
        # attribute; confirm callers only pass trusted / already escaped names.
        return ('<script type="text/javascript" src="//www.google.com/trends/embed.js?q='
                + project_name
                + '&content=1&cid=TIMESERIES_GRAPH_0&export=5&w=50&h=40"></script>')