forked from sparta-github-group6/show_me_the_recipe-mc6
-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawling_menus.py
93 lines (78 loc) · 3.93 KB
/
crawling_menus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests, re
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Desktop Chrome User-Agent so Naver serves the regular desktop markup.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# Authenticated local MongoDB connection (credentials test/test).
client = MongoClient('mongodb://test:test@localhost', 27017)
# client = MongoClient('localhost', 27017)
# Database holding the crawled recipes and the search seed documents.
db = client.dbmaking
# Crawl list pages 1-14 of the Naver terms recipe category and store each
# parsed recipe as one document in the `recipes` collection.
#
# Fixes over the original:
#  * the detail-page request now sends the same User-Agent header as the
#    list-page request (it previously went out with no headers),
#  * bare step-number fragments are filtered with comprehensions instead of
#    calling list.remove() while iterating the same list (which skips items),
#  * the inner index loop no longer shadows the outer page counter.
STEP_NUMBERS = {str(n) for n in range(1, 11)}  # bare "1".."10" fragments to drop

for page in range(1, 15):  # list pages 1..14
    link = ('https://terms.naver.com/list.naver?cid=42701&categoryId=62872'
            '&so=st1.dsc&viewType=&categoryType=&page=' + str(page))
    data = requests.get(link, headers=headers)
    soup = BeautifulSoup(data.text, 'html.parser')
    # One <li> per recipe on the listing page.
    recipes = soup.select('#content > div.list_wrap > ul > li')
    for recipe in recipes:
        a_tag = recipe.select_one('div.info_area > div.subject > strong > a')
        # Title text up to the first '[' is the recipe name.
        name = a_tag.text.split('[', maxsplit=1)[0].strip()
        recipe_link = 'https://terms.naver.com/' + a_tag['href']
        recipe_data = requests.get(recipe_link, headers=headers)
        recipe_soup = BeautifulSoup(recipe_data.text, 'html.parser')

        # The second <p class="txt"> chunk holds the ingredient list; strip
        # HTML tags with a regex and keep both a display string and a
        # comma-separated search list.
        chunks = str(recipe_soup.findAll("p", {"class": 'txt'})).split('</p>', maxsplit=1)
        chunks = chunks[1].split('</p>', maxsplit=1)
        list_for_recipe = re.sub('<.+?>', '', chunks[0], 0).strip()[2:]
        list_for_search = list(filter(None, re.sub('<.+?>', '', chunks[0], 0).strip().split(',')))

        # NOTE(review): `img` is extracted but never written into `doc`
        # below — confirm whether the thumbnail URL should be stored.
        if recipe.find(class_='thumb_area') is not None:
            img = recipe.select_one('div.thumb_area > div.thumb > a > img')['data-src']

        # Keep only the first word of each ingredient for searching.
        list_for_search = [item.split(maxsplit=1)[0] for item in list_for_search]

        # Body section: <h3>/<p> elements under #size_ct; the first two
        # entries are boilerplate, so drop them.
        page3 = recipe_soup.select_one('#size_ct')
        nl = [el.text for el in page3.find_all(['h3', 'p'])][2:]

        # Only standard-format pages (short body containing a literal
        # '준비하기' heading) are parsed and stored.
        if len(nl) < 26 and '준비하기' in nl:
            body = ''.join(nl).replace('(4인분)', '')
            desc = body.split('재료 및 분량')[0]
            after_ingredients = body.split('재료 및 분량')[1]
            ingredient = after_ingredients.split('준비하기')[0]
            steps = after_ingredients.split('준비하기')[1]
            precook_raw = steps.split('만들기')[0]
            making_raw = steps.split('만들기')[1]

            # Split step text on '.', drop empty pieces and bare
            # step-number fragments, and trim whitespace.
            precook2 = [seg.strip() for seg in precook_raw.split('.')
                        if seg and seg not in STEP_NUMBERS]
            making2 = [seg.strip() for seg in making_raw.split('.')
                       if seg and seg not in STEP_NUMBERS]

            print(name, list_for_search, list_for_recipe, desc, ingredient, precook2, making2)
            doc = {
                'name': name,
                'search': list_for_search,
                'ingredients': list_for_recipe,
                'desc': desc,
                'ingredient': ingredient,
                'precook': precook2,
                'making': making2,
                'like': 0,
            }
            db.recipes.insert_one(doc)
# Seed the `search` collection with two default lookup documents.
seed_docs = (
    {'name': '검색', 'index': '물'},
    {'name': '검색2', 'index': '계란찜'},
)
for seed in seed_docs:
    db.search.insert_one(seed)