Skip to content

Commit

Permalink
Reverted many changes and heavily simplified the library, removing al…
Browse files Browse the repository at this point in the history
…l features accumulated over the years that no longer work. Basic search I had from version 1 is still working so it's best to do one thing well and stick to it.
  • Loading branch information
MarioVilas committed Jul 11, 2020
1 parent dc38c96 commit b47a156
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 129 deletions.
118 changes: 21 additions & 97 deletions googlesearch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python

# Python bindings to the Google search engine
# Copyright (c) 2009-2019, Mario Vilas
# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -32,6 +31,7 @@
import random
import sys
import time
import ssl

if sys.version_info[0] > 2:
from http.cookiejar import LWPCookieJar
Expand All @@ -55,11 +55,6 @@
# Main search function.
'search',

# Specialized search functions.
'search_images', 'search_news',
'search_videos', 'search_shop',
'search_books', 'search_apps',

# Shortcut for "get lucky" search.
'lucky',

Expand All @@ -70,19 +65,19 @@
# URL templates to make Google searches.
url_home = "https://www.google.%(tld)s/"
url_search = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
"btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
"cr=%(country)s"
url_next_page = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&tbm=%(tpe)s&" \
"start=%(start)d&tbs=%(tbs)s&safe=%(safe)s&" \
"cr=%(country)s"
url_search_num = "https://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&" \
"num=%(num)d&btnG=Google+Search&tbs=%(tbs)s&safe=%(safe)s&" \
"tbm=%(tpe)s&cr=%(country)s"
"cr=%(country)s"
url_next_page_num = "https://www.google.%(tld)s/search?hl=%(lang)s&" \
"q=%(query)s&num=%(num)d&start=%(start)d&tbs=%(tbs)s&" \
"safe=%(safe)s&tbm=%(tpe)s&cr=%(country)s"
"safe=%(safe)s&cr=%(country)s"
url_parameters = (
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'tbm', 'cr')
'hl', 'q', 'num', 'btnG', 'start', 'tbs', 'safe', 'cr')

# Cookie jar. Stored at the user's home folder.
# If the cookie jar is inaccessible, the errors are ignored.
Expand Down Expand Up @@ -153,13 +148,15 @@ def get_tbs(from_date, to_date):

# Request the given URL and return the response page, using the cookie jar.
# If the cookie jar is inaccessible, the errors are ignored.
def get_page(url, user_agent=None):
def get_page(url, user_agent=None, verify_ssl=True):
"""
Request the given URL and return the response page, using the cookie jar.
:param str url: URL to retrieve.
:param str user_agent: User agent for the HTTP requests.
Use None for the default.
:param bool verify_ssl: Verify the SSL certificate to prevent
traffic interception attacks. Defaults to True.
:rtype: str
:return: Web page retrieved for the given URL.
Expand All @@ -173,7 +170,11 @@ def get_page(url, user_agent=None):
request = Request(url)
request.add_header('User-Agent', user_agent)
cookie_jar.add_cookie_header(request)
response = urlopen(request)
if verify_ssl:
response = urlopen(request)
else:
context = ssl._create_unverified_context()
response = urlopen(request, context=context)
cookie_jar.extract_cookies(response, request)
html = response.read()
response.close()
Expand Down Expand Up @@ -208,8 +209,8 @@ def filter_result(link):

# Returns a generator that yields URLs.
def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
stop=None, domains=None, pause=2.0, tpe='', country='',
extra_params=None, user_agent=None):
stop=None, pause=2.0, country='', extra_params=None,
user_agent=None, verify_ssl=True):
"""
Search the given query string using Google.
Expand All @@ -223,14 +224,9 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
:param int start: First result to retrieve.
:param int stop: Last result to retrieve.
Use None to keep searching forever.
:param list domains: A list of web domains to constrain
the search.
:param float pause: Lapse to wait between HTTP requests.
A lapse too long will make the search slow, but a lapse too short may
cause Google to block your IP. Your mileage may vary!
:param str tpe: Search type (images, videos, news, shopping, books, apps)
Use the following values {videos: 'vid', images: 'isch',
news: 'nws', shopping: 'shop', books: 'bks', applications: 'app'}
:param str country: Country or region to focus the search on. Similar to
changing the TLD, but does not yield exactly the same results.
Only Google knows why...
Expand All @@ -240,6 +236,8 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
{'filter': '0'} which will append '&filter=0' to every query.
:param str user_agent: User agent for the HTTP requests.
Use None for the default.
:param bool verify_ssl: Verify the SSL certificate to prevent
traffic interception attacks. Defaults to True.
:rtype: generator of str
:return: Generator (iterator) that yields found URLs.
Expand All @@ -252,11 +250,6 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
# Count the number of links yielded.
count = 0

# Prepare domain list if it exists.
if domains:
query = query + ' ' + ' OR '.join(
'site:' + domain for domain in domains)

# Prepare the search string.
query = quote_plus(query)

Expand All @@ -276,7 +269,7 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
)

# Grab the cookie from the home page.
get_page(url_home % vars(), user_agent)
get_page(url_home % vars(), user_agent, verify_ssl)

# Prepare the URL of the first request.
if start:
Expand Down Expand Up @@ -309,7 +302,7 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
time.sleep(pause)

# Request the Google Search results page.
html = get_page(url, user_agent)
html = get_page(url, user_agent, verify_ssl)

# Parse the response and get every anchored URL.
if is_bs4:
Expand Down Expand Up @@ -369,75 +362,6 @@ def search(query, tld='com', lang='en', tbs='0', safe='off', num=10, start=0,
url = url_next_page_num % vars()


# Shortcut for image searches.
# Note: the yielded URLs point to result pages, not to the images themselves.
def search_images(*args, **kwargs):
    """
    Perform a Google Images search.

    Accepts the same arguments and returns the same generator as the
    main search() function.

    :note: Beware, the returned URLs are not direct image links.
    """
    kwargs.update(tpe='isch')
    return search(*args, **kwargs)


# Shortcut for news searches.
def search_news(*args, **kwargs):
    """
    Perform a Google News search.

    Accepts the same arguments and returns the same generator as the
    main search() function.
    """
    kwargs.update(tpe='nws')
    return search(*args, **kwargs)


# Shortcut for video searches.
def search_videos(*args, **kwargs):
    """
    Perform a Google Videos search.

    Accepts the same arguments and returns the same generator as the
    main search() function.
    """
    kwargs.update(tpe='vid')
    return search(*args, **kwargs)


# Shortcut for shopping searches.
def search_shop(*args, **kwargs):
    """
    Perform a Google Shopping search.

    Accepts the same arguments and returns the same generator as the
    main search() function.
    """
    kwargs.update(tpe='shop')
    return search(*args, **kwargs)


# Shortcut for book searches.
def search_books(*args, **kwargs):
    """
    Perform a Google Books search.

    Accepts the same arguments and returns the same generator as the
    main search() function.
    """
    kwargs.update(tpe='bks')
    return search(*args, **kwargs)


# Shortcut for application searches.
def search_apps(*args, **kwargs):
    """
    Perform a Google Apps search.

    Accepts the same arguments and returns the same generator as the
    main search() function.
    """
    kwargs.update(tpe='app')
    return search(*args, **kwargs)


# Shortcut to single-item search.
# Evaluates the iterator to return the single URL as a string.
def lucky(*args, **kwargs):
Expand Down
35 changes: 5 additions & 30 deletions scripts/google
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python

# Python bindings to the Google search engine
# Copyright (c) 2009-2019, Mario Vilas
# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -65,19 +64,12 @@ def main():
parser.add_option(
'--lang', metavar='LANGUAGE', type='string', default='en',
help="produce results in the given language [default: en]")
parser.add_option(
'--domains', metavar='DOMAINS', type='string', default='',
help="comma separated list of domains to constrain the search to")
parser.add_option(
'--tbs', metavar='TBS', type='string', default='0',
help="produce results from period [default: 0]")
parser.add_option(
'--safe', metavar='SAFE', type='string', default='off',
help="kids safe search [default: off]")
parser.add_option(
'--type', metavar='TYPE', type='string', default='search', dest='tpe',
help="search type (search, images, videos, news, shopping, books,"
" apps) [default: search]")
parser.add_option(
'--country', metavar='COUNTRY', type='string', default='',
help="region to restrict search on [default: not restricted]")
Expand All @@ -94,7 +86,10 @@ def main():
'--pause', metavar='SECONDS', type='float', default=2.0,
help="pause between HTTP requests [default: 2.0]")
parser.add_option(
'--rua', metavar='USERAGENT', action='store_true', default=False,
'--rua', action='store_true', default=False,
help="Randomize the User-Agent [default: no]")
parser.add_option(
'--insecure', dest="verify_ssl", action='store_false', default=True,
help="do not verify SSL certificates [default: verify]")
(options, args) = parser.parse_args()
query = ' '.join(args)
Expand All @@ -106,26 +101,6 @@ def main():
if not k.startswith('_')]
params = dict(params)

# Split the comma separated list of domains, if present.
if 'domains' in params:
if params['domains']:
domains = [x.strip() for x in params['domains'].split(',')]
if domains and domains != ['']:
params['domains'] = domains
params['domains'] = None
else:
params['domains'] = None

# Use a special search type if requested.
if 'tpe' in params:
tpe = params['tpe']
if tpe and tpe not in (
'search', 'images', 'videos', 'news',
'shopping', 'books', 'apps'):
parser.error("invalid type: %r" % tpe)
if tpe == 'search':
params['tpe'] = ''

# Randomize the user agent if requested.
if 'rua' in params and params.pop('rua'):
params['user_agent'] = get_random_user_agent()
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

# Copyright (c) 2009-2019, Mario Vilas
# Copyright (c) 2009-2020, Mario Vilas
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -44,7 +44,7 @@
scripts=[join('scripts', 'google')],
package_data={'googlesearch': ['user_agents.txt.gz']},
include_package_data=True,
version="2.0.3",
version="3.0.0",
description="Python bindings to the Google search engine.",
author="Mario Vilas",
author_email="[email protected]",
Expand Down

0 comments on commit b47a156

Please sign in to comment.