Skip to content

Commit

Permalink
fix: download limit
Browse files Browse the repository at this point in the history
  • Loading branch information
CatchZeng committed Mar 7, 2021
1 parent 014caa6 commit 8b2cd71
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 10 deletions.
29 changes: 20 additions & 9 deletions bing_images/bing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,51 @@
def fetch_image_urls(
    query: str,
    limit: int = 20,
    file_type: str = '',
    filters: str = ''
) -> List[str]:
    """Fetch up to ``limit`` de-duplicated image URLs for ``query``.

    Args:
        query: Search keywords.
        limit: Maximum number of URLs to return.
        file_type: Optional file extension (e.g. ``"jpg"``). Empty string
            means "any type"; when set it is appended to the search
            keywords and used to filter the crawled URLs.
        filters: Extra Bing search filter string, passed straight through
            to the crawler.

    Returns:
        A list of at most ``limit`` unique image URLs.
    """
    result: List[str] = []
    keywords = query
    if len(file_type) > 0:
        # Bias the search itself toward the requested file type.
        keywords = query + " " + file_type
    urls = crawl_image_urls(keywords, filters, limit)
    for url in urls:
        # Keep only URLs matching the requested type, skipping duplicates.
        if isValidURL(url, file_type) and url not in result:
            result.append(url)
        if len(result) >= limit:
            break
    return result


def isValidURL(url, file_type):
    """Return True when ``url`` points at a file of ``file_type``.

    An empty ``file_type`` acts as a wildcard and accepts any URL.

    Fixes over the plain ``url.endswith(file_type)`` check:
    - case-insensitive, so ``a.JPG`` matches ``file_type="jpg"``;
    - the extension must be a real dot-separated suffix, so a URL
      merely ending in the letters ``jpg`` no longer matches;
    - ``file_type`` may be given with or without the leading dot.
    """
    if len(file_type) < 1:
        # No filter requested: every URL is acceptable.
        return True
    suffix = file_type.lower()
    if not suffix.startswith("."):
        suffix = "." + suffix
    return url.lower().endswith(suffix)


def download_images(
query: str,
limit: int = 20,
output_dir='',
pool_size: int = 20,
file_type: str = "jpg",
file_type: str = '',
filters: str = '',
force_replace=False
):
start = timer()
image_dir = make_image_dir(output_dir, force_replace)
print("Save path: {}".format(image_dir))

urls = fetch_image_urls(query, limit, file_type, filters)
# Fetch more image URLs than requested, since some of them may turn out to be invalid.
max_number = math.ceil(limit*1.5)
urls = fetch_image_urls(query, max_number, file_type, filters)
entries = get_image_entries(urls, image_dir)

print("Downloading images")
ps = pool_size
if limit < pool_size:
ps = limit
download_image_entries(entries, ps)
download_image_entries(entries, ps, limit)

rename_images(image_dir, query)

Expand All @@ -76,11 +85,13 @@ def rename_images(dir, prefix):
print("Finished renaming")


def download_image_entries(entries, pool_size):
def download_image_entries(entries, pool_size, limit):
counter = 1
results = ThreadPool(pool_size).imap_unordered(
download_image_with_thread, entries)
for (url, result) in results:
if counter > limit:
break
if result:
print("#{} {} Downloaded".format(counter, url))
counter = counter + 1
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = bing_images
version = 0.0.6
version = 0.1.0
author = CatchZeng
author_email = [email protected]
description = Python library to fetch image urls and download using multithreading from Bing.com.
Expand Down

0 comments on commit 8b2cd71

Please sign in to comment.