-
Notifications
You must be signed in to change notification settings - Fork 0
/
monthlynews.py
150 lines (124 loc) · 5.77 KB
/
monthlynews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
#from webdriver_manager.microsoft import EdgeChromiumDriverManager
#from selenium.webdriver.common.proxy import Proxy, ProxyType
import random
import time
import pandas as pd
import sys
import os
import urllib.parse
months = [
#{'name': 'March', 'start_date': '3/1/2022', 'end_date': '3/31/2022'},
{'name': 'April', 'start_date': '4/1/2022', 'end_date': '4/30/2022'},
{'name': 'May', 'start_date': '5/1/2022', 'end_date': '5/31/2022'},
#{'name': 'June', 'start_date': '6/1/2023', 'end_date': '6/30/2023'},
#{'name': 'July', 'start_date': '7/1/2023', 'end_date': '7/31/2023'},
#{'name': 'August', 'start_date': '8/1/2023', 'end_date': '8/31/2023'},
#{'name': 'September', 'start_date': '9/1/2023', 'end_date': '9/30/2023'},
#{'name': 'October', 'start_date': '10/1/2023', 'end_date': '10/31/2023'},
#{'name': 'November', 'start_date': '11/1/2023', 'end_date': '11/30/2023'},
#{'name': 'December', 'start_date': '12/1/2023', 'end_date': '12/31/2023'}
]
output_folder = 'D:/2022_March_to_December_light_volts_google_news_list'
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 设置控制台编码为UTF-8
sys.stdout.reconfigure(encoding='utf-8')
DO = 0
for month in months:
if DO:
time.sleep(30)
driver = webdriver.Edge()
query = '光伏产业'
encoded_query = urllib.parse.quote(query)
url = f'https://www.google.com/search?q={encoded_query}&tbs=cdr:1,cd_min:{month["start_date"]},cd_max:{month["end_date"]}&tbm=nws'
driver.get(url)
# 设置显式等待
wait = WebDriverWait(driver, 10)
# 初始化DataFrame
datalist = []
news = 0
page = 0
max_news = 4950
while True:
page = page + 1
print(page)
try:
# 等待政策项容器加载
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'SoaBEf')))
# 找到所有政策项的容器
items = driver.find_elements(By.CLASS_NAME, 'SoaBEf')
for item in items:
try:
news = news + 1
# URL, title
a_tag = item.find_element(By.TAG_NAME, 'a')
url = a_tag.get_attribute('href')
print(url)
# Use CSS selector for multiple classes and search within 'item'
data_class_pre = item.find_element(By.CSS_SELECTOR, '.lSfe4c.r5bEn.aI5QMe')
data_class = data_class_pre.find_element(By.CLASS_NAME, 'SoAPf')
# Extract summary
summary_class = data_class.find_element(By.CLASS_NAME, 'GI74Re.nDgy9d')
summary = summary_class.text
print(summary)
# Extract source and title
source_and_title_class = data_class.find_element(By.CLASS_NAME, 'MgUUmf.NUnG9d')
source = source_and_title_class.find_element(By.TAG_NAME, 'span').text
print(source)
title = data_class.find_element(By.CLASS_NAME, 'n0jPhd.ynAwRc.MBeuO.nDgy9d').text
print(title)
time_class = data_class.find_element(By.CLASS_NAME, 'OSrXXb.rbYSKb.LfVVr')
pub_time = time_class.find_element(By.TAG_NAME, 'span').text
print(pub_time)
# 存储数据
datalist.append({
'标题': title,
'来源': source,
'发布时间': pub_time,
'概要': summary,
'URL': url
})
print(f'成功爬取第 {news} 条新闻: {title}')
except Exception as e:
print(f'Error extracting item: {e}')
# # 找到并点击翻页链接
# try:
# next_button = driver.find_element(By.ID, "pnnext")
# next_button.click()
# # 等待下一页加载
# time.sleep(3)
# wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'SoaBEf')))
# except news == 4950:
# print("没有更多页面了")
# break
# except Exception as e:
# print(f'Error during pagination: {e}')
# break
# 如果已经达到预定新闻数量,跳出循环
if news >= max_news:
break
# 找到并点击翻页链接
try:
next_button = driver.find_element(By.ID, "pnnext")
next_button.click()
# 等待下一页加载
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'SoaBEf')))
except NoSuchElementException:
print("没有更多页面了")
break
except (TimeoutException, NoSuchElementException) as e:
print(f'翻页时出错: {e}')
break
# Save data to CSV
file_name = f'{output_folder}/{month["name"]}_news.csv'
data = pd.DataFrame(datalist, columns=['标题', '来源', '发布时间', '概要', 'URL'])
data.to_csv(file_name, index=False, encoding='utf_8_sig')
print(f'{month["name"]}数据已保存到 {file_name}')
# 关闭浏览器
driver.quit()
print("Done!")