-
Notifications
You must be signed in to change notification settings - Fork 0
/
rip8progressbar.py
52 lines (43 loc) · 2.04 KB
/
rip8progressbar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import json
import argparse
from bs4 import BeautifulSoup
from tqdm import tqdm # Import tqdm for the progress bar
def extract_text_from_paragraphs(html_file):
    """Return the paragraph text and the title of one HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser`` backend.

    Args:
        html_file: Path of the HTML file to read.

    Returns:
        A ``(text, title)`` tuple. ``text`` is the text of every ``<p>``
        element joined with newlines; ``title`` is the stripped text of the
        ``<h1>`` element returned by ``find``, or ``"No Title Found"`` when
        there is none.
    """
    # The document is fully parsed inside the constructor, so the handle can
    # be released before the tree is queried.
    with open(html_file, 'r', encoding='utf-8') as handle:
        document = BeautifulSoup(handle, 'html.parser')

    paragraph_texts = [node.get_text() for node in document.find_all('p')]

    heading = document.find('h1')
    if heading:
        title = heading.get_text().strip()
    else:
        title = "No Title Found"

    return '\n'.join(paragraph_texts), title
def remove_excessive_blank_lines(text):
    """Drop every empty or whitespace-only line from *text*.

    The input is split on ``'\\n'`` only; surviving lines are kept verbatim
    (leading/trailing whitespace included) and rejoined with ``'\\n'``.

    Args:
        text: The string to clean.

    Returns:
        The text with all blank lines removed.
    """
    # str.strip yields '' (falsy) for blank lines, so it doubles as the
    # keep/discard predicate.
    non_blank = filter(str.strip, text.split('\n'))
    return '\n'.join(non_blank)
def main(input_directory, output_file):
    """Convert every ``.html`` file under a directory into one JSONL file.

    The directory tree is walked recursively. Each HTML file becomes one
    line of ``output_file``: a JSON object with a ``"title"`` key and a
    ``"text"`` key (paragraph text with blank lines removed). A tqdm
    progress bar tracks the number of files processed.

    Args:
        input_directory: Root directory to search for ``.html`` files.
        output_file: Path of the JSONL file to write (overwritten if present).

    Raises:
        NotADirectoryError: If ``input_directory`` is not an existing
            directory.
    """
    # os.walk() ignores scan errors by default, so a missing or mistyped path
    # would otherwise produce an empty output file and a "successful" run.
    # Check before the output file is opened so nothing is clobbered.
    if not os.path.isdir(input_directory):
        raise NotADirectoryError(
            "Input directory not found: {!r}".format(input_directory))

    html_files = []
    for root, dirs, files in os.walk(input_directory):
        # Sorting dirs in place fixes the order in which os.walk descends;
        # together with the sorted file names this makes the output record
        # order reproducible instead of filesystem-dependent.
        dirs.sort()
        html_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith('.html')
        )

    # Initialize tqdm progress bar with the total number of HTML files
    with open(output_file, 'w', encoding='utf-8') as outfile, tqdm(
            total=len(html_files), desc="Processing HTML Files") as pbar:
        for html_file in html_files:
            text, h1_text = extract_text_from_paragraphs(html_file)
            cleaned_text = remove_excessive_blank_lines(text)
            json_record = {
                "title": h1_text,
                "text": cleaned_text,
            }
            # ensure_ascii=False keeps non-ASCII characters readable in the
            # UTF-8 output instead of escaping them.
            json_line = json.dumps(json_record, ensure_ascii=False)
            outfile.write(json_line + '\n')
            pbar.update(1)  # Update the progress bar
if __name__ == "__main__":
    # Command-line entry point: two required positional arguments, the
    # directory to scan and the JSONL file to write.
    cli = argparse.ArgumentParser(
        description="Extract text from HTML files within a directory and save them in a JSONL file."
    )
    cli.add_argument(
        "input_directory",
        type=str,
        help="Directory containing HTML files to process",
    )
    cli.add_argument(
        "output_file",
        type=str,
        help="Output JSONL file name",
    )
    options = cli.parse_args()
    main(options.input_directory, options.output_file)