amazonData.py
import os

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# Headless Chrome configured from environment variables
# (GOOGLE_CHROME_BIN and CHROMEDRIVER_PATH point at the binaries).
chrome_options = webdriver.ChromeOptions()
chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
# Selenium 3-style keyword arguments, as in the original script.
driver = webdriver.Chrome(executable_path=os.environ.get("CHROMEDRIVER_PATH"),
                          chrome_options=chrome_options)

linkOption = input("Do you want to search product with link (yes/no): ")
if linkOption.lower() == 'yes':
    reviewlist = []

    def get_url(search_term):
        """Return the product URL (the entered link is used as-is)."""
        return "{}".format(search_term)

    product = input("Enter the Product link of which you want to get Sentiment: ")
    url = get_url(product)  # product link
    driver.get(url)

    # Follow the "See all reviews" link on the product page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    sub_review_url = soup.find('a', {'data-hook': 'see-all-reviews-link-foot'})
    review_url = sub_review_url.get('href')
    driver.get("https://www.amazon.in" + review_url)

    # Collect the review bodies on the first reviews page.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        review = [
            # Other fields that could also be captured per review:
            # 'product': soup.title.text.replace('Amazon.in:Customer reviews:', '').strip(),
            # 'title':   item.find('a', {'data-hook': 'review-title'}).text.strip(),
            # 'rating':  float(item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()),
            item.find('span', {'data-hook': 'review-body'}).text.strip(),
        ]
        reviewlist.append(review)

    def view_comments():
        """Append the review bodies on the current page (global soup) to reviewlist."""
        reviews = soup.find_all('div', {'data-hook': 'review'})
        for item in reviews:
            review = [
                item.find('span', {'data-hook': 'review-body'}).text.strip(),
            ]
            reviewlist.append(review)

    # Walk the "Next page" link for up to 29 further pages, stopping early
    # when the pagination control is disabled (last page reached).
    for x in range(1, 30):
        next_page = soup.find('div', {'class': 'a-form-actions a-spacing-top-extra-large'})
        next_page_href = next_page.find('li', {'class': 'a-last'}).find('a').get('href')
        driver.get("https://www.amazon.in" + next_page_href)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        view_comments()
        if soup.find('li', {'class': 'a-disabled a-last'}):
            break

    # Save the collected reviews for the sentiment step.
    df = pd.DataFrame(reviewlist)
    df.to_excel('livedataset.xlsx', index=False)
    print('Finished..')
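
# Follow-up sketch (an assumption, not part of the original script): the saved
# workbook can be loaded back with pandas for the sentiment step, e.g.
#     reviews = pd.read_excel('livedataset.xlsx')
#     print(reviews.head())  # the single unnamed column holds the review text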