Skip to content

Commit

Permalink
Add node_number function.
Browse files Browse the repository at this point in the history
  • Loading branch information
wuyingren committed May 24, 2017
1 parent 9151e3c commit a87361f
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 18 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,4 @@ database.db
dump.rdb
.topics_all.json
settings.py
.node_number.json
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
后来发现v2ex也删贴,我感到有一点失望,但也对这种做法表示理解。

直到有一天我发现 [/t/349115](https://www.v2ex.com/t/349115) 发布不久便被删除了,这让我感到了欺骗。
我萌生了写一个v2ex删贴监测系统的想法。
于是,我萌生了写一个v2ex删贴监测系统的想法。

## 使用方法

Expand Down Expand Up @@ -49,5 +49,5 @@ $ cp settings.py.example settings.py
```

## 使用建议
1. 建议尽可能多的添加代理、添加UA
1. 该项目所需时间较长,请耐心等待。启动后,两周后可查看第一批删贴监测结果
1. 为降低被封概率,请根据需求添加代理、UA
1. 该项目默认设置为两周后开始检测删贴,如需修改,可修改`run.py``tester_tasker`函数中的SQL语句
4 changes: 3 additions & 1 deletion Start_rqworker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ rqworker node2 &> log/node2.log &
rqworker node3 &> log/node3.log &
rqworker node4 &> log/node4.log &
rqworker node5 &> log/node5.log &
rqworker failed &> log/failed.log &
rqworker failed &> log/failed1.log &
rqworker failed &> log/failed2.log &
rqworker failed &> log/failed3.log &
rqworker topic &> log/topic.log &
rqworker tester &> log/tester.log &
echo "Start rqworker,Finished!"
48 changes: 36 additions & 12 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self):
self.redis_conn=Redis()
self.load_config()
#start
self.load_time_json()
self.load_json()
self.update_cookies()
try:
self.update_nodes()
Expand All @@ -42,16 +42,35 @@ def __init__(self):
self.tasker()
self.tester_tasker()
#end
self.end()

def end(self):
    """Shut down: close the database connection and persist state files.

    Persisting both `.time_log.json` and `.node_number.json` is delegated
    to dump_json(); the direct `.time_log.json` write that used to live
    here duplicated dump_json()'s work and made the file be written twice,
    so it has been removed.
    """
    self.SQ.close_datebase()
    self.dump_json()

def load_time_json(self):
def load_json(self):
    """Load persisted crawler state from the working directory.

    Populates:
      self.time_log    -- dict of per-task last-run timestamps (strings of
                          unix seconds) read from `.time_log.json`.
      self.node_number -- list of node IDs pending a re-crawl, read from
                          `.node_number.json`.

    When a file does not exist yet (e.g. first run), defaults are used.
    """
    # EAFP: attempt the read and fall back on FileNotFoundError instead of
    # the racy os.path.exists() check used previously.
    try:
        with open('.time_log.json','r') as f:
            self.time_log=json.load(f)
    except FileNotFoundError:
        # Every timestamp starts at '0' so all tasks are considered due.
        self.time_log={'cookies_time':'0','nodes_time':'0','8000_node':'0','4000_node':'0','1000_node':'0','500_node':'0','0_node':'0','rss_time':'0','tester':'0'}
    try:
        with open('.node_number.json','r') as f:
            self.node_number=json.load(f)
    except FileNotFoundError:
        self.node_number=list()
    return

def dump_json(self):
#dump .time_log.json
with open('.time_log.json','w') as f1:
json.dump(self.time_log, f1)
#dump .node_number.json
with open('.node_number.json','w') as f2:
self.node_number=list(set(self.node_number))
json.dump(self.node_number,f2)
return

def update_cookies(self):
Expand All @@ -78,9 +97,7 @@ def update_nodes(self):
if not nodes_time_status:
resp=self.s.get('https://www.v2ex.com/api/nodes/all.json')
if resp.status_code != 200:
self.SQ.close_datebase()
with open('.time_log.json','w') as f:
json.dump(self.time_log, f)
self.end()
raise APIError
nodes=resp.json()
for node in nodes:
Expand All @@ -94,21 +111,24 @@ def update_nodes(self):
footer=node["footer"]
created=node["created"]
n_time=int(time.time())
if self.SQ.node_test(n_id, topics) is True:
self.node_number.append(int(n_id))
self.SQ.write_to_db_node(n_id, name, url, title, title_alternative, topics, header, footer, created, n_time)
self.time_log["nodes_time"]=str(int(time.time()))
self.node_number=list(set(self.node_number))
return

def tasker(self):
node_configs_1=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':900,'time_log':'8000_node','queue_name':'node1'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':1800,'time_log':'4000_node','queue_name':'node2'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':7200,'time_log':'1000_node','queue_name':'node3'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 500;','sleep_time':90,'between_time':172800,'time_log':'0_node','queue_name':'node5'}]
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}]
node_configs_2=[{'sql':'SELECT ID FROM NODES WHERE topics >= 8000;','sleep_time':5,'between_time':1800,'time_log':'8000_node','queue_name':'node1'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 3000 AND 8000;','sleep_time':10,'between_time':3600,'time_log':'4000_node','queue_name':'node2'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1000 AND 3000;','sleep_time':20,'between_time':14400,'time_log':'1000_node','queue_name':'node3'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 100 AND 1000;','sleep_time':90,'between_time':86400,'time_log':'500_node','queue_name':'node4'},
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 500;','sleep_time':90,'between_time':172800,'time_log':'0_node','queue_name':'node5'}]
{'sql':'SELECT ID FROM NODES WHERE topics BETWEEN 1 AND 100;','sleep_time':90,'between_time':86400,'time_log':'0_node','queue_name':'node5'}]
time.tzname=('CST', 'CST')
if int(time.strftime('%H')) >= 8 or int(time.strftime('%H')) < 2:
node_configs=node_configs_1
Expand All @@ -119,13 +139,17 @@ def tasker(self):
sleep_time=node_config['sleep_time']
between_time=node_config['between_time']
time_log_name=node_config['time_log']
q_node=Queue(node_config['queue_name'],connection=self.redis_conn)
queue_name=node_config['queue_name']
q_node=Queue(queue_name,connection=self.redis_conn)
if int(time.time()) - int(self.time_log[time_log_name]) >= between_time:
self.SQ.cursor.execute(sql)
node_ids=self.SQ.cursor.fetchall()
for node_id in node_ids:
node_id=node_id[0]
q_node.enqueue(node_spider.start,node_id,sleep_time)
if queue_name != 'node5' or (queue_name == 'node5' and node_id in self.node_number):
node_id=node_id[0]
if queue_name == 'node5':
self.node_number.remove(int(node_id))
q_node.enqueue(node_spider.start,node_id,sleep_time)
self.time_log[time_log_name]=str(int(time.time()))
return

Expand Down
14 changes: 14 additions & 0 deletions sql/create_table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,17 @@ CREATE TABLE NODES (
created INTEGER,
time INTEGER
);

-- View: HUMAN_READER
-- Joins each TOPIC row with its node's name and any deletion-test rows
-- from STATUS, adding human-readable datetimes for the unixepoch
-- timestamp columns.  Topics with no STATUS row still appear (LEFT
-- OUTER JOIN) with NULL test columns; a topic with several STATUS rows
-- appears once per row.
CREATE VIEW HUMAN_READER AS
SELECT TOPIC.ID,TOPIC.title,TOPIC.author,TOPIC.author_id,TOPIC.content,TOPIC.content_rendered,TOPIC.replies,
NODES_1.name AS node_name,TOPIC.node AS node_id,
TOPIC.created AS create_time,DATETIME(TOPIC.created,'unixepoch') AS create_time_h,TOPIC.time AS grab_time,DATETIME(TOPIC.time,'unixepoch') AS grab_time_h,
STATUS.TIME AS test_time,DATETIME(STATUS.TIME,'unixepoch') AS test_time_h,STATUS.NODE AS node_id_on_test,NODES_2.name AS node_name_on_test,STATUS.STATUS
FROM TOPIC
-- node the topic belongs to (required: topics without a node are dropped)
INNER JOIN NODES AS NODES_1
ON NODES_1.ID = TOPIC.node
-- optional deletion-test results
LEFT OUTER JOIN STATUS
ON STATUS.T_ID = TOPIC.ID
-- node referenced by the STATUS row (NOTE(review): presumably the node at
-- test time, which may differ from TOPIC.node -- confirm against writer)
LEFT OUTER JOIN NODES AS NODES_2
ON NODES_2.ID = STATUS.NODE
ORDER BY TOPIC.ID ASC;
19 changes: 17 additions & 2 deletions v2ex_base/v2_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def __init__(self):
write_to_db_base
>>>SQ.write_to_db_base(t_id,title,author,author_id,content,content_rendered,replies,node,created,n_time)
write_to_db_node
>>>SQ.write_to_db_node(n_id,name,url,title,title_alternative,topics,header,footer,created,n_time))
>>>SQ.write_to_db_node(n_id,name,url,title,title_alternative,topics,header,footer,created,n_time)
node_test
>>>SQ.node_test(node_id,number_now)
'''
self.database_path=settings.database_path

Expand Down Expand Up @@ -50,7 +52,7 @@ def write_to_db_node(self,n_id,name,url,title,title_alternative,topics,header,fo
pass
self.conn.commit()
return

def write_to_db_status(self,T_ID,NODE,STATUS,TIME):
sql="INSERT INTO STATUS (T_ID,NODE,STATUS,TIME) VALUES ( %s );" % ', '.join(['?'] * 4)
try:
Expand All @@ -59,3 +61,16 @@ def write_to_db_status(self,T_ID,NODE,STATUS,TIME):
pass
self.conn.commit()
return

def node_test(self,node_id,number_now):
    """Return True when node *node_id* should be (re-)processed.

    True when the node has no row in NODES yet, or when its stored
    `topics` count differs from *number_now*; False when the stored
    count is unchanged.
    """
    # Parameterized query instead of %-interpolation: avoids SQL
    # injection by construction and lets sqlite prepare the statement.
    self.cursor.execute("SELECT topics FROM NODES WHERE ID = ?;",(int(node_id),))
    number_old_r=self.cursor.fetchone()
    if number_old_r is None:
        # Node not recorded yet -> treat as changed.
        return True
    # A changed topic count means there are new/removed topics to check.
    return int(number_old_r[0]) != int(number_now)

0 comments on commit a87361f

Please sign in to comment.