air_ticket_scraping_main.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd

# Selenium 4 style: wrap the chromedriver path in a Service object
driver = webdriver.Chrome(service=Service('/your_path/your_project/driver/chromedriver'))
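# Note: the chromedriver binary has to match the locally installed Chrome version;
# download it separately and point the path above at it.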
# URL of the LINE Travel flight-search results page
air_ticket_url = '''
https://travel.line.me/flights/list?roundType=1&cabinClass=1&numOfAdult=2&numOfChildren=0&numOfBaby=0&linePointsRebateOnly=1&departureAirports=&departureCities=TPE&departureDates=1694390400000&arrivalAirports=&arrivalCities=OSA&departureAirports=&departureCities=OSA&departureDates=1694822400000&arrivalAirports=&arrivalCities=TPE
'''
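# The query encodes a TPE <-> OSA round trip for two adults; the departureDates values
# are Unix timestamps in milliseconds (here 2023-09-11 and 2023-09-16 UTC).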
# Increase this wait if page loads are slow because of network delay;
# at least 10 seconds is recommended.
driver.implicitly_wait(10)
# strip() removes the newlines that the triple-quoted URL string carries
driver.get(air_ticket_url.strip())
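# Note: implicitly_wait only affects find_element look-ups; since this script parses
# page_source with BeautifulSoup, the sleeps in page_scrolldown below are what actually
# give the lazily loaded results time to appear.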
# Scroll to the bottom repeatedly so the page lazy-loads more results
def page_scrolldown():
    try:
        for i in range(1, 20):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(0.7)
    except Exception as e:
        print('Scrolling failed, check the error:', e)

page_scrolldown()
# Grab the fully rendered HTML, then shut down the browser and the driver process
html_source_code = driver.page_source
driver.quit()
soup = BeautifulSoup(html_source_code, 'html.parser')
# check the source code without tags
# print(soup.text)
# Display the total number of tickets found
record = soup.select_one('#__next > div.css-1cp3u8n.e1a1bycy0 > div:nth-child(2) > span')
print('搜尋的機票比價總筆數:',record.text)
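# Note: the css-xxxx class names and #__next selectors used below are generated by the
# site's build, so they can change whenever LINE Travel redeploys; update the selectors
# if nothing is matched.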
# Collect departure/arrival times and airports for the outbound and return legs
time_loc_list = []
for time_loc_info in soup.select('#__next > div.css-1cp3u8n.e1a1bycy0 > div:nth-child(2)'):
    for tl in time_loc_info.select('.css-nkthol.eooboqb1, .css-1qzlsgj.e4pxchi2'):
        # keep the name `time` free; don't shadow the time module here
        time_loc_list.append(tl.text)
# Group time_loc_list into tuples of eight consecutive elements: departure time/place and
# arrival time/place for the outbound leg, then the same four fields for the return leg.
tuple_list = [tuple(time_loc_list[i:i+8]) for i in range(0, len(time_loc_list), 8)]
# print(tuple_list)
df_time = pd.DataFrame(tuple_list, columns=["起飛時間(出發)","起飛地點(出發)","抵達時間(出發)","抵達地點(出發)", "起飛時間(回程)","起飛地點(回程)", "抵達時間(回程)","抵達地點(回程)"])
result = []
# Departure/arrival time + location blocks (debug aid; uncomment the print to inspect them)
for time_loc_info in soup.select('#__next > div.css-1cp3u8n.e1a1bycy0 > div:nth-child(2)'):
    for tl in time_loc_info.select('.css-1eowobi .css-j7qwjs'):
        time_loc = tl.text
        # print(time_loc)
air_ticket_info = soup.find_all(class_='css-1eowobi')

# Using TinyURL to shorten the purchase links; create the shortener once
# instead of importing it inside the loop
import pyshorteners
s = pyshorteners.Shortener()

for ticket_info in air_ticket_info:
    # airline company
    airline = ticket_info.find(class_='css-84a4s3 e1fe20ih3').getText().strip()
    # air ticket provider (booking site)
    ticket_site = ticket_info.find(class_='css-6x2xcr e1fe20ih2').getText().strip()
    # price per person
    ticket_per_price = ticket_info.find(class_='css-iw7h7v ejxn77z0').getText().strip().replace(',', '')
    # total ticket price (2 people); [9:] drops the fixed-length label in front of the amount
    total_price = ticket_info.find(class_='css-wycfi3 e1fe20ih3').getText()[9:].replace(',', '')
    # ticket purchase url, shortened with TinyURL
    ticket_purchase_url = 'https://travel.line.me/' + ticket_info.find('a').get('href')
    ticket_purchase_short_url = s.tinyurl.short(ticket_purchase_url)
    result.append((airline, ticket_site, int(ticket_per_price), int(total_price), ticket_purchase_short_url))
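# Note: s.tinyurl.short makes one HTTP request to the TinyURL API per ticket,
# so shortening can noticeably slow down runs that return many results.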
df = pd.DataFrame(result, columns=["航空公司", "購買網站", "一人價格(TWD)", "兩人總價(TWD)", "買票去!"])
# print(result)
# df_time and df have the same number of rows but different columns,
# so concat joins them side by side along the column axis.
ticket_full_info = pd.concat([df_time, df], axis=1)
# Start the index at 1 so row numbers read naturally in the report
ticket_full_info.index += 1
print(ticket_full_info.head(5))
# Date stamp for the output file name (YYYYMMDD)
now = datetime.datetime.now()
date = now.strftime("%Y%m%d")
# Current time, used in the (optional) report title
loc_dt = datetime.datetime.today()
loc_dt_format = loc_dt.strftime("%Y/%m/%d %H:%M:%S")
# dataframe to csv
scraping_date = date
ticket_full_info.to_csv(f"{scraping_date}_air_ticket_full_info.csv")
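# Note: if the CSV is opened in Excel and the Chinese headers look garbled,
# pass encoding='utf-8-sig' to to_csv.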
import pygsheets
auth_file = "credentials.json"
gc = pygsheets.authorize(service_file=auth_file)
# Open the spreadsheet by URL with pygsheets
sheet_url = "https://docs.google.com/spreadsheets/yoursheeturlnamexxxxx"
sheet = gc.open_by_url(sheet_url)
# Select the worksheet by its title
air_ticket_sheet_01 = sheet.worksheet_by_title("air ticket price comparison")
# Update values in the worksheet
# title_date = 'A1'
# air_ticket_sheet_01.update_values(title_date, [['台灣大阪來回機票即時比價報表' + '\n' + loc_dt_format]])
attributes = 'A1'
air_ticket_sheet_01.update_values(attributes, [["起飛時間(出發)","起飛地點(出發)","抵達時間(出發)","抵達地點(出發)", "起飛時間(回程)","起飛地點(回程)", "抵達時間(回程)","抵達地點(回程)","航空公司", "購買網站", "一人價格(TWD)", "兩人總價(TWD)", "買票去!"]])
start_record = 'A2'
# df.values.tolist() converts the DataFrame rows into a list of lists for update_values
air_ticket_sheet_01.update_values(start_record, ticket_full_info.values.tolist())
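# Note: the service account in credentials.json must be granted edit access to the
# spreadsheet (share the sheet with the service-account e-mail address), otherwise
# open_by_url / update_values will fail with a permission error.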
import requests
import schedule

def send_notification():
    # Read the data back from the Google Sheet
    data = air_ticket_sheet_01.get_all_records()
    # Take the first three rows (assumes the sheet rows are already ordered with the lowest prices first)
    top_three_rows = data[:3]
    # Send a LINE Notify message
    line_notify_url = "https://notify-api.line.me/api/notify"
    msg = '\n台灣大阪來回機票\n即時比價報表\n\n'
    for row in top_three_rows:
        msg += f'''💎航空公司: {row["航空公司"]}\n💎一人價格(TWD): {row["一人價格(TWD)"]}元\n💎兩人總價(TWD): {row["兩人總價(TWD)"]}元\n✈️哪次不衝了,買票去!\n{row["買票去!"]}\n\n'''
    # The message must be sent as a plain string, not a set
    payload = {'message': msg}
    headers = {'Authorization': 'Bearer ' + 'yourtokenhere'}
    response = requests.request("POST", line_notify_url, headers=headers, data=payload)
    # Status code / response body
    print(response.text)
# Set the time for sending notifications (example: every day at 10 AM)
schedule.every().day.at("10:00").do(send_notification)

# Infinite loop to keep the script running
while True:
    schedule.run_pending()
    time.sleep(1)
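# Keep this process alive (e.g. via nohup, tmux, or a systemd service) so the daily job
# keeps firing. Note that prices are scraped once at start-up; re-run the script (or move
# the scraping into a scheduled function) if the sheet should be refreshed before each notification.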