Skip to content

Commit

Permalink
NN-617 Extract source code & sequencing data
Browse files Browse the repository at this point in the history
  • Loading branch information
mominaatifdar committed Dec 12, 2024
1 parent eccbe78 commit 7abf730
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 37 deletions.
4 changes: 3 additions & 1 deletion scraping/scraping/data_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def format_data():
formated_data = []
for value in new_data.values():
formated_data.append(value)
with open(f"{file}_formatted.json","w") as f:

output_file = file.rstrip('.json')
with open(f"{output_file}_formatted.json","w") as f:
json.dump(formated_data, f, indent=4)

if __name__ == "__main__":
Expand Down
75 changes: 39 additions & 36 deletions scraping/scraping/extract_data_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ def send_to_mistral(input_data):
You are a highly accurate extraction tool. I will provide you with the DOI of a scientific publication and its 'Data Availability' section.
Analyze the text and extract:
1. Accession codes (e.g., "XYZ123") or URLs from databases like GEO, ENA, or SRA.
1. Accession codes or URLs from databases like GEO, ENA, or SRA.
2. Source code URLs from GitHub or Zenodo.
Return the results in JSON format:
Expand Down Expand Up @@ -54,43 +54,46 @@ def send_to_mistral(input_data):
"source code": []
}

def process_json_files(directory="."):
    """Run the Mistral extraction over every formatted JSON file in a directory.

    Files are selected when their name starts with ``_`` and ends with
    ``formatted.json``. Each file is loaded as a list of entries; for every
    entry the ``DOI`` and ``paragraph`` fields are sent to
    ``send_to_mistral``. The collected results for ``<stem>.json`` are
    written next to the input as ``<stem>_mistral.json``.

    Args:
        directory: Folder to scan for input files and to write results into.
            Defaults to the current working directory, so calling the
            function with no argument keeps the previous behaviour.
    """
    # Sorted so the processing order does not depend on the filesystem.
    names = sorted(
        name
        for name in os.listdir(directory)
        if name.startswith('_') and name.endswith('formatted.json')
    )

    for name in names:
        print("Filename:", name)

        with open(os.path.join(directory, name), 'r', encoding='utf-8') as f:
            data = json.load(f)

        results = []
        for entry in data:
            doi = entry.get("DOI", "")
            input_data = {
                "DOI": doi,
                "Data Availability": entry.get("paragraph", ""),
            }

            # Best effort: one failing entry must not abort the whole file,
            # so the error is reported and the entry is skipped.
            try:
                result = send_to_mistral(input_data)
            except Exception as e:
                print(f"Error processing DOI {doi}: {str(e)}")
            else:
                results.append({
                    "DOI": doi,
                    "results": result
                })

        # splitext removes only the extension; str.rstrip('.json') would
        # strip any trailing run of the characters '.', 'j', 's', 'o', 'n'
        # and can eat into the file stem.
        stem, _ext = os.path.splitext(name)
        output_file = os.path.join(directory, f"{stem}_mistral.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)

if __name__ == "__main__":
    # Scan the current working directory for "_*formatted.json" inputs.
    process_json_files()

0 comments on commit 7abf730

Please sign in to comment.