Skip to content

Commit

Permalink
fix: download limit
Browse files Browse the repository at this point in the history
  • Loading branch information
CatchZeng committed Mar 7, 2021
1 parent 014caa6 commit 8b2cd71
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 10 deletions.
29 changes: 20 additions & 9 deletions bing_images/bing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,51 @@
def fetch_image_urls(
    query: str,
    limit: int = 20,
    file_type: str = '',
    filters: str = ''
) -> List[str]:
    """Fetch up to ``limit`` de-duplicated image URLs for ``query``.

    Args:
        query: Search keywords.
        limit: Maximum number of URLs to return.
        file_type: Optional file extension (e.g. ``"jpg"``). Empty string
            means "any type"; when set it is appended to the search
            keywords and used to filter the crawled URLs.
        filters: Extra Bing search filter string, passed straight through
            to the crawler.

    Returns:
        A list of at most ``limit`` unique image URLs.
    """
    result: List[str] = []
    keywords = query
    if len(file_type) > 0:
        # Bias the search itself toward the requested file type.
        keywords = query + " " + file_type
    urls = crawl_image_urls(keywords, filters, limit)
    for url in urls:
        # Keep only URLs matching the requested type, skipping duplicates.
        if isValidURL(url, file_type) and url not in result:
            result.append(url)
        if len(result) >= limit:
            break
    return result


def isValidURL(url, file_type):
    """Return True when ``url`` points at a file of ``file_type``.

    An empty ``file_type`` acts as a wildcard and accepts any URL.

    Fixes over the plain ``url.endswith(file_type)`` check:
    - case-insensitive, so ``a.JPG`` matches ``file_type="jpg"``;
    - the extension must be a real dot-separated suffix, so a URL
      merely ending in the letters ``jpg`` no longer matches;
    - ``file_type`` may be given with or without the leading dot.
    """
    if len(file_type) < 1:
        # No filter requested: every URL is acceptable.
        return True
    suffix = file_type.lower()
    if not suffix.startswith("."):
        suffix = "." + suffix
    return url.lower().endswith(suffix)


def download_images(
query: str,
limit: int = 20,
output_dir='',
pool_size: int = 20,
file_type: str = "jpg",
file_type: str = '',
filters: str = '',
force_replace=False
):
start = timer()
image_dir = make_image_dir(output_dir, force_replace)
print("Save path: {}".format(image_dir))

urls = fetch_image_urls(query, limit, file_type, filters)
# Fetch more image URLs than requested, since some of them may turn out to be invalid.
max_number = math.ceil(limit*1.5)
urls = fetch_image_urls(query, max_number, file_type, filters)
entries = get_image_entries(urls, image_dir)

print("Downloading images")
ps = pool_size
if limit < pool_size:
ps = limit
download_image_entries(entries, ps)
download_image_entries(entries, ps, limit)

rename_images(image_dir, query)

Expand All @@ -76,11 +85,13 @@ def rename_images(dir, prefix):
print("Finished renaming")


def download_image_entries(entries, pool_size):
def download_image_entries(entries, pool_size, limit):
counter = 1
results = ThreadPool(pool_size).imap_unordered(
download_image_with_thread, entries)
for (url, result) in results:
if counter > limit:
break
if result:
print("#{} {} Downloaded".format(counter, url))
counter = counter + 1
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = bing_images
version = 0.0.6
version = 0.1.0
author = CatchZeng
author_email = [email protected]
description = Python library to fetch image urls and download using multithreading from Bing.com.
Expand Down

0 comments on commit 8b2cd71

Please sign in to comment.