![]() |
University Research Program for Google Search |
|
Google Research Other Google Resources |
Example Code
The following is a Python example powered by the University Research Program for Google Search. Download the example here. For additional documentation, click here.
#!/usr/bin/python
#
# Copyright 2007 Google Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
import getopt
import os
import sys
import urllib
import urllib2
import xml.dom.minidom
# Endpoint that Search() sends its query string to.
BASE_URL = 'http://research.google.com/university/search/service'
# Default for the --size option; sent to the service as the 'rsz' parameter.
DEFAULT_SIZE = 'small'
# Default for the --start option; kept as a string because it is only ever
# passed through to the query string as the 'start' parameter.
DEFAULT_START = '0'
# NOTE(review): not referenced by any code visible in this file; presumably
# the XML namespace of the service response -- confirm before relying on it.
NAMESPACE = 'http://research.google.com/university/search'
# Help text printed by PrintUsageAndExit().
USAGE = '''Usage: search-example.py [options] --id [project id] terms
This script iterates over University Research Program for Google
Search results and prints the response to stdout.
Use of this service is governed by the terms made available at:
http://research.google.com/university/search/terms.html
Options:
-i --id : the assigned project id [required]
-s --start : the starting search index [optional, default 0]
-z --size : the result size ('small' or 'large') [optional, default 'small']
-h --help : print this help [optional]
Example:
search-example.py --id project-stanford.edu "google code"
'''
class Response(object):
    '''A wrapper around the XML of a search response'''

    def __init__(self, node):
        '''Construct a wrapper around an XML search response.

        Exposes the following properties:
          terms: the requested search terms (text of the first Q element)
          size: the requested number of search results (the 'num' PARAM),
              or None when the response carries no such PARAM
          start: the requested start index, offset 0 (the 'start' PARAM),
              or None when the response carries no such PARAM
          first: the index (offset 1) of the first result in the response
              (the SN attribute of the RES element)
          last: the index (offset 1) of the last result in the response
              (the EN attribute of the RES element)
          total: the total number of search results (text of the M element)
          results: a list of Result instances, one per R element

        Args:
          node: An xml.dom.Node instance containing the search response

        Raises:
          IndexError: if the response lacks a Q, RES or M element
        '''
        # Give the PARAM-derived attributes a value up front so that they
        # always exist, even when the response omits the matching PARAM.
        self.size = None
        self.start = None

        # Parse the response for information about the request
        self.terms = GetText(node.getElementsByTagName('Q')[0])
        for param in node.getElementsByTagName('PARAM'):
            name = param.getAttribute('name')
            if name == 'num':
                self.size = param.getAttribute('value')
            elif name == 'start':
                self.start = param.getAttribute('value')

        # Parse the response for metadata about the results
        res = node.getElementsByTagName('RES')[0]
        self.first = res.getAttribute('SN')
        self.last = res.getAttribute('EN')
        self.total = GetText(res.getElementsByTagName('M')[0])

        # Parse the individual results
        self.results = [Result(r) for r in res.getElementsByTagName('R')]

    def __str__(self):
        '''Return a representation of this instance as a unicode string'''
        parts = [
            'terms: %s\n' % self.terms,
            'size: %s\n' % self.size,
            'start: %s\n' % self.start,
            'first: %s\n' % self.first,
            'last: %s\n' % self.last,
            'total: %s\n' % self.total,
            'results: \n',
        ]
        # Each result renders itself as an already newline-terminated block.
        parts.extend(unicode(result) for result in self.results)
        return ''.join(parts)
class Result(object):
    '''A wrapper around the XML of an individual result'''

    def __init__(self, node):
        '''Build the wrapper from a single search result element.

        The following properties become available:
          index: the index of the result (offset 1), from the N attribute
          url: the address of the matching page (U element)
          encoded_url: the url-encoded address of the matching page
              (UE element)
          title: the title of the matching page, includes <b> tags
              (T element)
          title_no_bold: the title of the matching page, no <b> tags
              (TNB element)

        Args:
          node: An xml.dom.Node instance containing a search result
        '''
        def first_text(tag):
            # Text content of the first descendant element named `tag`.
            return GetText(node.getElementsByTagName(tag)[0])

        self.index = node.getAttribute('N')
        self.url = first_text('U')
        self.encoded_url = first_text('UE')
        self.title = first_text('T')
        self.title_no_bold = first_text('TNB')

    def __str__(self):
        '''Return a representation of this instance as a unicode string'''
        rows = (
            (' index: %s\n', self.index),
            (' url: %s\n', self.url),
            (' encoded_url: %s\n', self.encoded_url),
            (' title: %s\n', self.title),
            (' title_no_bold: %s\n', self.title_no_bold),
        )
        return ''.join([template % value for template, value in rows])
def GetText(node):
    '''Concatenate the text held directly beneath an XML node.

    Only immediate children whose type is TEXT_NODE contribute; text nested
    inside child elements is not included.

    Args:
      node: An xml.dom.Node instance

    Returns:
      a string containing the data of every direct TEXT_NODE child, in
      document order (empty when there is none)
    '''
    pieces = [child.data
              for child in node.childNodes
              if child.nodeType == xml.dom.Node.TEXT_NODE]
    return ''.join(pieces)
def PrintUsageAndExit(message=None):
    '''Show the usage text on stdout and terminate with exit status 2.

    Args:
      message: An optional error message; when truthy it is printed,
        prefixed with "Error: ", ahead of the usage string.
    '''
    if message:
        print("Error: %s" % message)
    print(USAGE)
    # Same effect as sys.exit(2): unwind via SystemExit with status 2.
    raise SystemExit(2)
def Search(id, size, start, terms):
    '''Perform a search and return the parsed response.

    The request URL is printed to standard out before it is fetched.

    Args:
      id: the assigned service id (sent as 'clid')
      size: the desired size of the search response ('small' or 'large')
        (sent as 'rsz')
      start: the index of the first search result (sent as 'start')
      terms: the terms to search for (sent as 'q')

    Returns:
      A Response instance representing the search results
    '''
    values = {'clid': id, 'rsz': size, 'start': start, 'q': terms}
    url = '?'.join([BASE_URL, urllib.urlencode(values)])
    request = urllib2.Request(url)
    print(url)
    response = urllib2.urlopen(request)
    try:
        document = xml.dom.minidom.parse(response)
    finally:
        # Release the HTTP connection even when the XML cannot be parsed.
        response.close()
    return Response(document)
def ParseArgs(args):
    '''Parse the command line for the required and optional arguments.

    Prints the usage text and exits (via PrintUsageAndExit) when an option
    is invalid, when --help is given, when no id is supplied, or when no
    search terms remain after the options.

    Args:
      args: the array of command line arguments, after the program name.

    Returns:
      A tuple of (id, size, start, terms), where terms is the remaining
      non-option arguments joined with single spaces
    '''
    shortflags = 'hi:s:z:'
    longflags = ['help', 'id=', 'start=', 'size=']
    try:
        opts, args = getopt.gnu_getopt(args, shortflags, longflags)
    except getopt.GetoptError as err:
        # Use the message of the raised instance; the class-level
        # getopt.GetoptError.msg is only the empty default, so reading it
        # would silently drop the reason the parse failed.
        PrintUsageAndExit(err.msg)

    id = None
    size = DEFAULT_SIZE
    start = DEFAULT_START
    for o, a in opts:
        if o in ('-h', '--help'):
            PrintUsageAndExit()
        elif o in ('-i', '--id'):
            id = a
        elif o in ('-s', '--start'):
            start = a
        elif o in ('-z', '--size'):
            size = a

    if not id:
        PrintUsageAndExit('Id required')
    terms = ' '.join(args)
    if not terms:
        PrintUsageAndExit('Could not read search terms')
    return (id, size, start, terms)
def main():
    '''Run one search from the command line and print it UTF-8 encoded.'''
    parsed = ParseArgs(sys.argv[1:])
    # parsed is the (id, size, start, terms) tuple, in Search()'s order.
    response = Search(*parsed)
    print(unicode(response).encode('utf8'))


if __name__ == "__main__":
    main()
|