NN-617 Extract source code & sequencing data

BackofenLab · Dec 12, 2024 · 7abf730 · 7abf730
1 parent eccbe78
commit 7abf730
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 37 deletions.
diff --git a/scraping/scraping/data_formatting.py b/scraping/scraping/data_formatting.py
@@ -23,7 +23,9 @@ def format_data():
         formated_data = []
         for value in new_data.values():
             formated_data.append(value)
-        with open(f"{file}_formatted.json","w") as f:
+
+        output_file = file.rstrip('.json')
+        with open(f"{output_file}_formatted.json","w") as f:
             json.dump(formated_data, f, indent=4)
 
 if __name__ == "__main__":

diff --git a/scraping/scraping/extract_data_mistral.py b/scraping/scraping/extract_data_mistral.py
@@ -9,7 +9,7 @@ def send_to_mistral(input_data):
     You are a highly accurate extraction tool. I will provide you with the DOI of a scientific publication and its 'Data Availability' section.
 
     Analyze the text and extract:
-    1. Accession codes (e.g., "XYZ123") or URLs from databases like GEO, ENA, or SRA.
+    1. Accession codes or URLs from databases like GEO, ENA, or SRA.
     2. Source code URLs from GitHub or Zenodo.
 
     Return the results in JSON format:
@@ -54,43 +54,46 @@ def send_to_mistral(input_data):
             "source code": []
         }
 
-def process_json_files(directory):
+def process_json_files():
 
-    """Generalized filepath in future"""
+    """Run send_to_mistral on each '_*formatted.json' file in the working directory."""
+    files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('formatted.json')]
+    for file in files:
+        print("Filename:", file)
 
-    file_path = "_Science Bulletin_Cancer Discovery_Cell Research_udc.json"
-    with open(file_path, 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-        results = []
-        for entry in data:
-            doi = entry.get("DOI", "")
-            data_availability = entry.get("paragraph", "")
-            input_data = {
-                "DOI": doi,
-                "Data Availability": data_availability
-            }
-            print('Processing DOI:', doi)
-
-            try:
-                # start_time = time.time()
-                result = send_to_mistral(input_data)
-                # end_time = time.time()
-                # print("Execution time:", end_time-start_time)
-                results.append({
+        with open(file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+            results = []
+            for entry in data:
+                doi = entry.get("DOI", "")
+                data_availability = entry.get("paragraph", "")
+                input_data = {
                     "DOI": doi,
-                    "results": result
-                })
-                # print(f"Processed entry. Result: {result}")
-            except Exception as e:
-                print(f"Error processing DOI {doi}: {str(e)}")
-
-        # print(results)
-        # Save results to output file
-        output_file = "extraction_results_mistral_sciBulletin.json"
-        with open(output_file, 'w', encoding='utf-8') as f:
-            json.dump(results, f, indent=4)
+                    "Data Availability": data_availability
+                }
+                # Only the DOI and its "paragraph" text are forwarded to the model.
+
+                try:
+                    # Exceptions from the call are caught below: the DOI is logged and skipped.
+                    result = send_to_mistral(input_data)
+                    # Only successful calls reach the append, so a failed DOI is
+                    # missing from the output rather than stored with an empty result.
+                    results.append({
+                        "DOI": doi,
+                        "results": result
+                    })
+                    # print(f"Processed entry. Result: {result}")
+                except Exception as e:
+                    print(f"Error processing DOI {doi}: {str(e)}")
+
+            # Write one result file per input, named "<input stem>_mistral.json".
+            # splitext drops only the extension; rstrip('.json') strips a character set.
+            output_file = os.path.splitext(file)[0]
+            output_file = f"{output_file}_mistral.json"
+            with open(output_file, 'w', encoding='utf-8') as out:
+                json.dump(results, out, indent=4)
 
 if __name__ == "__main__":
-    json_directory = os.path.dirname(os.path.abspath(__file__))
-    process_json_files(json_directory)
+
+    process_json_files()