-
Notifications
You must be signed in to change notification settings - Fork 0
/
energy_policy_full_text_final.py
97 lines (84 loc) · 4.02 KB
/
energy_policy_full_text_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import os
from gne import GeneralNewsExtractor
from PyPDF2 import PdfReader
import win32com.client
def change_word_to_txt(word_path, save_path):
word = win32com.client.Dispatch('Word.Application')
doc = word.Documents.Open(word_path)
doc.SaveAs(save_path, 2)
doc.Close()
word.Quit()
save_dir = 'D:/energypolicytext'
# 读取CSV文件
df = pd.read_csv('D://能源政策.csv', encoding='utf-8')
urls = df['URL'].tolist()
titles = df['标题'].tolist()
for index, (url, title) in enumerate(zip(urls, titles)):
try:
response = requests.get(url)
response.encoding = response.apparent_encoding
html = response.text
extractor = GeneralNewsExtractor()
result = extractor.extract(html, with_body_html=False)
content = result.get('content', '')
if '附件' in content:
print("有附件")
driver = webdriver.Edge()
driver.get(url)
attachments_pdf = driver.find_elements(By.XPATH, "//div[@id='UCAP-CONTENT']//a[contains(@href, '.pdf')]")
attachments_doc = driver.find_elements(By.XPATH, "//div[@id='UCAP-CONTENT']//a[contains(@href, '.doc')]")
attachments_docx = driver.find_elements(By.XPATH, "//div[@id='UCAP-CONTENT']//a[contains(@href, '.docx')]")
# Combine all attachments into a single list
attachments = attachments_pdf + attachments_doc + attachments_docx
# Initialize a counter for附件序号
attachment_counter = 1
for attachment in attachments:
link = attachment.get_attribute('href')
print(f"附件链接:{link}")
# Determine the file extension
extension = os.path.splitext(link)[1]
if not extension:
# If extension not found, assume based on the list it came from
if attachment in attachments_pdf:
extension = '.pdf'
elif attachment in attachments_doc:
extension = '.doc'
elif attachment in attachments_docx:
extension = '.docx'
else:
extension = '.unknown'
# Construct the filename
filename = f"energypolicy_{index + 1}_附件{attachment_counter}{extension}"
filepath = os.path.join(save_dir, filename)
try:
response = requests.get(link, stream=True)
response.raise_for_status()
with open(filepath, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
print(f"下载完成: {filepath}")
except requests.exceptions.HTTPError as errh:
print(f"HTTP Error: {errh}")
except requests.exceptions.ConnectionError as errc:
print(f"Error Connecting: {errc}")
except requests.exceptions.Timeout as errt:
print(f"Timeout Error: {errt}")
except requests.exceptions.RequestException as err:
print(f"Error: {err}")
attachment_counter += 1
driver.quit()
filename_txt = f'energypolicy_{index + 1}.txt'
full_path = os.path.join(save_dir, filename_txt)
with open(full_path, 'w', encoding='utf-8') as f:
f.write(content)
print(f'第 {index + 1} 条政策已保存。')
except Exception as e:
print(f'第 {index + 1} 条政策提取失败,原因:{e}')