cvpr_get_paper.py
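"""Scrape CVPR paper links from https://gaplab.cuhk.edu.cn/cvpapers and download the PDFs.

The script runs in two phases:
  1. get_paper_url() drives a Chrome browser over each year's listing page and
     appends every paper link it finds to a text file (one URL per line).
  2. parse_url_txt() reads that file back and downloads each paper, either as a
     direct PDF or through the arXiv API.

Assumed dependencies (PyPI package names): selenium, lxml, requests, arxiv.
A matching ChromeDriver must also be available on PATH.
"""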
import io
import os
import time
import uuid

import arxiv
import requests
from lxml import etree
from selenium import webdriver
def download_arxiv_paper(id_item, save_path, filename):
    """Fetch one paper from arXiv by identifier and save it under save_path."""
    # Search.results() is deprecated in recent releases of the arxiv package;
    # going through a Client works on both old and new versions.
    paper = next(arxiv.Client().results(arxiv.Search(id_list=[id_item])))
    paper.download_pdf(dirpath=save_path, filename=filename)
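# Illustrative call (the identifier below is only an example):
#   download_arxiv_paper('1706.03762', './papers/', 'attention.pdf')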
def get_paper_url(url, url_txt):
    """Collect paper links from one CVPR listing page and append them to url_txt."""
    option = webdriver.ChromeOptions()
    # Run Chrome in developer mode and hide the usual automation fingerprints,
    # so the page is less likely to detect the scripted browser.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    option.add_argument("--disable-blink-features=AutomationControlled")
    wb = webdriver.Chrome(options=option)
    # Mask navigator.webdriver before any page script runs.
    wb.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
    })
    wb.maximize_window()
    wb.get(url)
    wb.implicitly_wait(5)
    time.sleep(3)  # give the single-page app time to render the paper tables
    data = wb.page_source
    wb.quit()  # close the browser before parsing, so one window is open at a time
    html = etree.HTML(data)
    with open(url_txt, 'a') as url_txt_writer:
        # Each session on the page is a <div> holding one table of papers;
        # the fourth column of every row links to the paper's PDF or abstract.
        class_number = html.xpath('/html/body/div[1]/div[2]/div/div[3]/div/div/div')
        for num_class in range(1, len(class_number) + 1):
            link_number = html.xpath(
                '/html/body/div[1]/div[2]/div/div[3]/div/div/div['
                + str(num_class) + ']/table/tbody/tr')
            for num in range(1, len(link_number) + 1):
                links = html.xpath(
                    '/html/body/div[1]/div[2]/div/div[3]/div/div/div['
                    + str(num_class) + ']/table/tbody/tr[' + str(num) + ']/td[4]/a')
                for link in links:
                    url_txt_writer.write(link.get("href") + "\n")
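# The link file produced above holds one URL per line; parse_url_txt() below
# handles direct PDF links and arXiv abstract links (examples are illustrative):
#   https://openaccess.thecvf.com/.../SomePaper.pdf
#   https://arxiv.org/abs/2103.00020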
def parse_url_txt(url_txt, save_path):
    """Download every paper listed in url_txt into save_path."""
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with open(url_txt) as infile:
        lines = infile.readlines()
    for line in lines:
        url = line.strip()
        if url.endswith(".pdf"):
            try:
                req = requests.get(url)
                time.sleep(10)  # be polite to the host between downloads
                bytes_io = io.BytesIO(req.content)
                filename = uuid.uuid4().hex
                with open(os.path.join(save_path, filename + ".pdf"), "wb") as file:
                    file.write(bytes_io.getvalue())
            except Exception:
                continue
        elif 'arxiv' in url:
            try:
                # e.g. https://arxiv.org/abs/2103.00020 -> id_item = '2103.00020'
                id_item = url.split('abs/')[1]
                filename = uuid.uuid4().hex + ".pdf"
                download_arxiv_paper(id_item, save_path, filename)
            except Exception:
                continue
if __name__ == "__main__":
    # Phase 1: collect every paper link for each CVPR year into link.txt.
    years = ['2017', '2018', '2019', '2020', '2021', '2022', '2023']
    url_txt = "link.txt"
    for year in years:
        url = 'https://gaplab.cuhk.edu.cn/cvpapers/#/' + year + 'cvpr'
        get_paper_url(url, url_txt)
    # Phase 2: once link.txt is complete, uncomment to download the papers.
    # url_txt = "link.txt"
    # save_path = './papers/'
    # parse_url_txt(url_txt, save_path)