Parent: [92d055] (diff)

Child: [db36b7] (diff)

Download this file

utils.py    165 lines (145 with data), 6.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# This Source Code Form of OSSEval is subject to the terms of the GNU AFFERO
# GENERAL PUBLIC LICENSE, v. 3.0. If a copy of the AGPL was not
# distributed with this file, You can obtain one at http://www.gnu.org/licenses/agpl.txt
#
# OSSeval is powered by the SOS Open Source AGPL edition.
# The AGPL requires that you do not remove the SOS Open Source attribution and copyright
# notices from the user interface (see section 5.d below).
# OSSEval Copyright 2014 Bitergium SLL
# SOS Open Source Copyright 2012 Roberto Galoppini
# Author: Davide Galletti
from json import JSONEncoder
from apiclient.discovery import build
from apiclient.errors import HttpError
import OSSEval
import urllib2
from lxml import html
class Configuration():
    """Namespace for settings that this module re-exports from OSSEval."""

    # Developer key read once, at class-definition time, from the OSSEval module.
    google_developerKey = OSSEval.google_developerKey
class TrivialJSONEncoder(JSONEncoder):
    """JSON encoder that serializes unknown objects as their attribute dict."""

    def default(self, o):
        """Return ``o.__dict__`` as the JSON-serializable stand-in for ``o``.

        Called by the encoder only for objects it cannot serialize natively.

        Raises:
            TypeError: if ``o`` has no ``__dict__`` (e.g. a set or a slotted
                object). This is raised by the base implementation, which is
                the error the ``json`` machinery expects from ``default``.
        """
        try:
            return o.__dict__
        except AttributeError:
            # Defer to the base class so callers see the standard
            # "not JSON serializable" TypeError instead of an AttributeError.
            return JSONEncoder.default(self, o)
class xmlMinidom():
    """Best-effort accessors for values stored in DOM nodes.

    Every helper swallows lookup failures and returns a fallback value
    instead of raising, so callers can probe optional tags and attributes.
    NOTE(review): the class name suggests ``xml.dom.minidom`` nodes; any
    object exposing the same attributes works.
    """

    @staticmethod
    def getString(xmldoc, tag):
        """Return the text of the first ``tag`` element below ``xmldoc``.

        The value is the ``data`` of the first child of the first element
        returned by ``xmldoc.getElementsByTagName(tag)``.

        Returns:
            The text, or "" if any step of the lookup fails (no such
            element, element without children, ...).
        """
        try:
            return xmldoc.getElementsByTagName(tag)[0].firstChild.data
        except Exception:
            # Deliberate best-effort: any lookup failure means "no value".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return ""

    @staticmethod
    def getStringAttribute(xmldoc, tag):
        """Return the value of attribute ``tag`` of node ``xmldoc``.

        Returns:
            The attribute text, or "" if the attribute cannot be read.
        """
        try:
            return xmldoc.attributes[tag].firstChild.data
        except Exception:
            # Best-effort: missing attribute or node without attributes.
            return ""

    @staticmethod
    def getNaturalAttribute(xmldoc, tag):
        """Return attribute ``tag`` of node ``xmldoc`` converted with ``int``.

        Returns:
            The integer value, or None if the attribute is missing or its
            text is not a valid integer.
        """
        try:
            return int(xmldoc.attributes[tag].firstChild.data)
        except Exception:
            # Best-effort: covers both a failed lookup and a failed int().
            return None
class SearchEngine():
    """Build search-engine query URLs and look up result counts.

    ``search_count`` and ``search_url`` are the generic entry points; both
    delegate to the Gigablast implementations. The Google variants remain
    available under their explicit names.

    Every ``sites`` parameter is an optional sequence of site names used to
    restrict the query; an empty sequence (or None) means no restriction.
    """

    # Custom Search Engine id passed as ``cx`` by google_search_count.
    _GOOGLE_CSE_ID = '017576662512468239146:omuauf_lfve'

    # Markers looked for in the Gigablast result page, e.g.
    #   "Results <b>1</b> to <b>10</b> of about <b>5,278,952</b>"
    #   "</form>No results found for <b>"
    _NO_RESULTS_MARKER = "</form>No results found for <b>"
    _COUNT_PREFIX = "</b> of about <b>"
    _COUNT_SUFFIX = "</b>"

    @staticmethod
    def search__engine_name():
        """Return the display name of the engine used by search_count/search_url."""
        return "Gigablast"

    @staticmethod
    def search_url_parameters(search_text, sites=()):
        """Return the ``q=...`` query-string fragment for a search.

        Only ":" and "+" in ``search_text`` are percent-escaped here; spaces
        are left as-is (search_count_scraping escapes them before fetching).

        With one site the result is ``q=<text>+site%3A<site>``; with several
        it is ``q=<text> (site%3A<a> OR site%3A<b> ...)``.
        """
        query = "q=" + search_text.replace(":", "%3A").replace("+", "%2B")
        if not sites:
            return query
        if len(sites) == 1:
            return query + "+site%3A" + sites[0]
        return query + " (" + " OR ".join("site%3A" + site for site in sites) + ")"

    @staticmethod
    def gigablast_search_url(search_text, sites=()):
        """Return the Gigablast search URL for ``search_text`` limited to ``sites``."""
        return "http://www.gigablast.com/search?" + SearchEngine.search_url_parameters(search_text, sites)

    @staticmethod
    def google_search_url(search_text, sites=()):
        """Return the Google URL for ``search_text`` limited to ``sites``."""
        return "https://www.google.com/?" + SearchEngine.search_url_parameters(search_text, sites)

    @staticmethod
    def readable_query(search_text, sites=()):
        """Return a human-readable form of the query.

        Reverses the ``%3A``, ``%20`` and ``%2B`` escapes in ``search_text``
        and appends the site restriction using plain ``site:`` prefixes.
        """
        text = search_text.replace("%3A", ":").replace("%20", " ").replace("%2B", "+")
        if not sites:
            return text
        if len(sites) == 1:
            return text + "+site:" + sites[0]
        return text + "(" + " OR ".join("site:" + site for site in sites) + ")"

    @staticmethod
    def _parse_result_count(html_page):
        """Extract the total number of results from a Gigablast result page.

        Returns:
            0 when the page carries the "no results" marker, the parsed
            integer when the "of about <b>N</b>" marker is present, and -1
            when neither marker can be found or N is not a number.
        """
        if SearchEngine._NO_RESULTS_MARKER in html_page:
            return 0
        start = html_page.find(SearchEngine._COUNT_PREFIX)
        if start == -1:
            # Unknown page layout: report failure rather than slicing from
            # an arbitrary offset.
            return -1
        start += len(SearchEngine._COUNT_PREFIX)
        end = html_page.find(SearchEngine._COUNT_SUFFIX, start)
        if end == -1:
            return -1
        try:
            # The count is rendered with thousands separators ("5,278,952").
            return int(html_page[start:end].replace(",", ""))
        except ValueError:
            return -1

    @staticmethod
    def search_count_scraping(search_text, sites=()):
        """Return the Gigablast result count for a query by scraping the page.

        Returns:
            The number of results as an int, 0 when the engine reports no
            results, or -1 when the page cannot be fetched or parsed.
        """
        url = SearchEngine.gigablast_search_url(search_text, sites).replace(" ", "%20")
        try:
            response = urllib2.urlopen(url)
            try:
                html_page = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except (urllib2.URLError, IOError):
            # urllib2 reports failures as URLError/HTTPError (IOError
            # subclasses); socket-level read errors are IOError as well.
            return -1
        return SearchEngine._parse_result_count(html_page)

    @staticmethod
    def search_count(search_text, sites=()):
        """Return the result count for a query using the default engine."""
        return SearchEngine.search_count_scraping(search_text, sites)

    @staticmethod
    def search_url(search_text, sites=()):
        """Return the search URL for a query using the default engine."""
        return SearchEngine.gigablast_search_url(search_text, sites)

    @staticmethod
    def google_search_count(search_text, sites=()):
        '''
        Return the total result count from the Google Custom Search API.

        Without sites a single query is issued; otherwise one query per site
        is issued (using siteSearch) and the totals are summed.
        Returns -1 when the API raises HttpError.

        https://developers.google.com/custom-search/json-api/v1/reference/cse/list
        exactTerms string Identifies a phrase that all documents in the search results must contain.
        excludeTerms string Identifies a word or phrase that should not appear in any documents in the search results.
        siteSearch string Specifies all search results should be pages from a given site.
        siteSearchFilter string Controls whether to include or exclude results from the site named in the siteSearch parameter.
        Acceptable values are: "e": exclude "i": include
        '''
        # One set of extra request arguments per query to issue.
        if sites:
            per_query_args = [{"siteSearch": site} for site in sites]
        else:
            per_query_args = [{}]
        try:
            service = build("customsearch", "v1", developerKey=OSSEval.google_developerKey)
            total = 0
            for extra_args in per_query_args:
                res = service.cse().list(q=search_text, cx=SearchEngine._GOOGLE_CSE_ID, **extra_args).execute()
                total += int(res['searchInformation']['totalResults'])
            return total
        except HttpError:
            return -1

    @staticmethod
    def trends_img(project_name):
        """Return the HTML script tag embedding a Google Trends time-series graph.

        NOTE(review): ``project_name`` is interpolated into the URL and the
        HTML attribute without any escaping; callers must pass a trusted or
        already-escaped value.
        """
        return ('<script type="text/javascript" src="//www.google.com/trends/embed.js?q='
                + project_name
                + '&content=1&cid=TIMESERIES_GRAPH_0&export=5&w=50&h=40"></script>')