Skip to content

Commit

Permalink
feat: add extra_query_params
Browse files Browse the repository at this point in the history
  • Loading branch information
CatchZeng committed Jan 25, 2022
1 parent 9a5d728 commit c5c1e14
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 10 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Python library to fetch image urls and download using **multithreading** from [B

- [x] Support **file type** filters.
- [x] Support [Bing.com](https://bing.com/) **filterui** filters.
- [x] Support **extra query params** appended to the search URL, such as the `&first=100&tsc=ImageBasicHover` in `https://cn.bing.com/images/search?q=cat&first=100&tsc=ImageBasicHover`.
- [x] Download using **multithreading** and custom thread **pool size**.
- [x] Support **purely** obtaining the image urls.

Expand All @@ -36,7 +37,7 @@ fetch_image_urls.py
```py
from bing_images import bing

urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw')
urls = bing.fetch_image_urls("cat", limit=10, file_type='png', filters='+filterui:aspect-square+filterui:color2-bw', extra_query_params='&first=1')
print("{} images.".format(len(urls)))
counter = 1
for url in urls:
Expand Down Expand Up @@ -76,7 +77,8 @@ bing.download_images("cat",
output_dir="/Users/catchzeng/Desktop/cat",
pool_size=10,
file_type="png",
force_replace=True)
force_replace=True,
extra_query_params='&first=1')
```

> - **output_dir**: the default output_dir is `os.path.join(os.getcwd(), "bing-images")`
Expand Down
10 changes: 6 additions & 4 deletions bing_images/bing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ def fetch_image_urls(
query: str,
limit: int = 20,
file_type: str = '',
filters: str = ''
filters: str = '',
extra_query_params: str =''
) -> List[str]:
result = list()
keywords = query
if len(file_type) > 0:
keywords = query + " " + file_type
urls = crawl_image_urls(keywords, filters, limit)
urls = crawl_image_urls(keywords, filters, limit, extra_query_params)
for url in urls:
if isValidURL(url, file_type) and url not in result:
result.append(url)
Expand All @@ -47,15 +48,16 @@ def download_images(
pool_size: int = 20,
file_type: str = '',
filters: str = '',
force_replace=False
force_replace=False,
extra_query_params: str =''
):
start = timer()
image_dir = make_image_dir(output_dir, force_replace)
print("Save path: {}".format(image_dir))

# Fetch more image URLs than requested, because some of them may turn out to be invalid.
max_number = math.ceil(limit*1.5)
urls = fetch_image_urls(query, max_number, file_type, filters)
urls = fetch_image_urls(query, max_number, file_type, filters, extra_query_params)
entries = get_image_entries(urls, image_dir)

print("Downloading images")
Expand Down
7 changes: 4 additions & 3 deletions bing_images/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@
BASE_URL = "https://www.bing.com/images/search?"


def gen_query_url(keywords, filters):
def gen_query_url(keywords, filters, extra_query_params =''):
keywords_str = "&q=" + quote(keywords)
query_url = BASE_URL + keywords_str
if len(filters) > 0:
query_url += "&qft="+filters
query_url += extra_query_params
return query_url


Expand Down Expand Up @@ -43,7 +44,7 @@ def image_url_from_webpage(driver, max_number=10000):
return image_urls


def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http"):
def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type="http", extra_query_params =''):
chrome_path = shutil.which("chromedriver")
chrome_path = "./bin/chromedriver" if chrome_path is None else chrome_path
chrome_options = webdriver.ChromeOptions()
Expand All @@ -52,7 +53,7 @@ def crawl_image_urls(keywords, filters, max_number=10000, proxy=None, proxy_type
"--proxy-server={}://{}".format(proxy_type, proxy))
driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options)

query_url = gen_query_url(keywords, filters)
query_url = gen_query_url(keywords, filters, extra_query_params)
driver.set_window_size(1920, 1080)
driver.get(query_url)
image_urls = image_url_from_webpage(driver, max_number)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = bing_images
version = 0.1.1
version = 0.2.1
author = CatchZeng
author_email = [email protected]
description = Python library to fetch image urls and download using multithreading from Bing.com.
Expand Down

0 comments on commit c5c1e14

Please sign in to comment.