diff --git a/scraping/scraping/data_formatting.py b/scraping/scraping/data_formatting.py
index a64bff16..6b3db0a6 100644
--- a/scraping/scraping/data_formatting.py
+++ b/scraping/scraping/data_formatting.py
@@ -23,7 +23,9 @@ def format_data():
         formated_data = []
         for value in new_data.values():
             formated_data.append(value)
-        with open(f"{file}_formatted.json","w") as f:
+
+        output_file = file.rstrip('.json')
+        with open(f"{output_file}_formatted.json","w") as f:
             json.dump(formated_data, f, indent=4)
 
 if __name__ == "__main__":
diff --git a/scraping/scraping/extract_data_mistral.py b/scraping/scraping/extract_data_mistral.py
index b501afc8..453ac83f 100644
--- a/scraping/scraping/extract_data_mistral.py
+++ b/scraping/scraping/extract_data_mistral.py
@@ -9,7 +9,7 @@ def send_to_mistral(input_data):
     You are a highly accurate extraction tool. I will provide you with the DOI of a scientific publication and its 'Data Availability' section.
     Analyze the text and extract:
 
-    1. Accession codes (e.g., "XYZ123") or URLs from databases like GEO, ENA, or SRA.
+    1. Accession codes or URLs from databases like GEO, ENA, or SRA.
     2. Source code URLs from GitHub or Zenodo.
 
     Return the results in JSON format:
@@ -54,43 +54,46 @@ def send_to_mistral(input_data):
         "source code": []
     }
 
 
-def process_json_files(directory):
+def process_json_files():
-    """Generalized filepath in future"""
+    files = [f for f in os.listdir('.') if f.startswith('_') and f.endswith('formatted.json')]
+    # file = "_Science Bulletin_Cancer Discovery_Cell Research_udc.json"
+    for file in files:
+        print("Filename:", file)
-    file_path = "_Science Bulletin_Cancer Discovery_Cell Research_udc.json"
-    with open(file_path, 'r', encoding='utf-8') as file:
-        data = json.load(file)
-
-    results = []
-    for entry in data:
-        doi = entry.get("DOI", "")
-        data_availability = entry.get("paragraph", "")
-        input_data = {
-            "DOI": doi,
-            "Data Availability": data_availability
-        }
-        print('Processing DOI:', doi)
-
-        try:
-            # start_time = time.time()
-            result = send_to_mistral(input_data)
-            # end_time = time.time()
-            # print("Execution time:", end_time-start_time)
-            results.append({
+        with open(file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        results = []
+        for entry in data:
+            doi = entry.get("DOI", "")
+            data_availability = entry.get("paragraph", "")
+            input_data = {
                 "DOI": doi,
-                "results": result
-            })
-            # print(f"Processed entry. Result: {result}")
-        except Exception as e:
-            print(f"Error processing DOI {doi}: {str(e)}")
-
-    # print(results)
-    # Save results to output file
-    output_file = "extraction_results_mistral_sciBulletin.json"
-    with open(output_file, 'w', encoding='utf-8') as f:
-        json.dump(results, f, indent=4)
+                "Data Availability": data_availability
+            }
+            # print('Processing DOI:', doi)
+
+            try:
+                # start_time = time.time()
+                result = send_to_mistral(input_data)
+                # end_time = time.time()
+                # print("Execution time:", end_time-start_time)
+                results.append({
+                    "DOI": doi,
+                    "results": result
+                })
+                # print(f"Processed entry. Result: {result}")
+            except Exception as e:
+                print(f"Error processing DOI {doi}: {str(e)}")
+
+        # print(results)
+        # Save results to output file
+        output_file = file.rstrip('.json')
+        output_file = f"{output_file}_mistral.json"
+        with open(output_file, 'w', encoding='utf-8') as f:
+            json.dump(results, f, indent=4)
 
 
 if __name__ == "__main__":
-    json_directory = os.path.dirname(os.path.abspath(__file__))
-    process_json_files(json_directory)
\ No newline at end of file
+
+    process_json_files()
\ No newline at end of file
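
Note, not part of the patch: both new output-file names are derived with str.rstrip('.json'), which strips any trailing run of the characters '.', 'j', 's', 'o', 'n' rather than the literal ".json" suffix, so a name like "results_json.json" would be cut down to "results_". A minimal suffix-safe sketch, assuming Python 3.9+ for str.removesuffix; the helper name strip_json_suffix is hypothetical and not in the repository:

    def strip_json_suffix(filename):
        # Removes only the literal ".json" suffix; unlike rstrip('.json'),
        # other trailing '.', 'j', 's', 'o', 'n' characters are left intact.
        return filename.removesuffix(".json")

    output_file = f"{strip_json_suffix(file)}_mistral.json"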