-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_extraction.py
137 lines (112 loc) · 6.92 KB
/
data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import keepa
import numpy as np
import matplotlib as plt
def save_bestsellers_from_cat(cat, filename, api_key=None):
    '''Query Keepa for the best sellers of a category and save them to a numpy file.

    :param cat: Amazon category ID
    :type cat: str
    :param filename: filename to save bestsellers to
    :type filename: str
    :param api_key: Keepa API key. When omitted, the ``KEEPA_API_KEY``
        environment variable is used, falling back to the key that was
        previously hard-coded in this function.
    :type api_key: str or None
    :return: bestsellers as returned by ``best_sellers_query``
    :rtype: list of str
    :raises TypeError: if ``cat``, ``filename`` or ``api_key`` is not a str
    :raises ValueError: if ``cat`` or ``filename`` is empty
    '''
    import os

    # Explicit raises instead of ``assert`` so validation survives ``python -O``.
    if not isinstance(cat, str):
        raise TypeError("cat must be a str")
    if not isinstance(filename, str):
        raise TypeError("filename must be a str")
    if not cat or not filename:
        raise ValueError("cat and filename must be non-empty")
    if api_key is not None and not isinstance(api_key, str):
        raise TypeError("api_key must be a str or None")

    if api_key is None:
        # NOTE(review): the fallback literal is a credential committed to
        # source control; it is kept only so existing callers keep working.
        # Rotate it and rely on KEEPA_API_KEY instead.
        api_key = os.environ.get(
            'KEEPA_API_KEY',
            'e6ihvarndmd2iee2bgeg60afm06gru9242g310tb4tv1kji72u57uon4us908d5h',
        )
    api = keepa.Keepa(api_key)

    # Obtain all bestsellers from category
    bestsellers = api.best_sellers_query(cat)
    np.save(filename, bestsellers)
    print("Best Sellers for Category {0} saved to {1}".format(cat, filename))
    return bestsellers
def save_products(product_ids, filename, ratings=True, api_key=None):
    '''Query Keepa for the given product IDs and save the product dicts to a numpy file.

    :param product_ids: product IDs to query
    :type product_ids: list of str (or numpy array of str)
    :param filename: filename to save product dictionaries to
    :type filename: str
    :param ratings: whether to also request rating and review data; this
        consumes more tokens, so fewer IDs are accepted per call
    :type ratings: bool
    :param api_key: Keepa API key. When omitted, the ``KEEPA_API_KEY``
        environment variable is used, falling back to the key that was
        previously hard-coded in this function.
    :type api_key: str or None
    :return: product dicts, or ``None`` if the query or save did not complete
    :rtype: list of dict or None
    :raises TypeError: if an argument has the wrong type
    :raises ValueError: if ``filename`` is empty or the number of IDs is
        outside the per-call limit (150 with ratings, 300 without)
    '''
    import os

    # Explicit raises instead of ``assert`` so validation survives ``python -O``.
    if not isinstance(product_ids, (list, np.ndarray)):
        raise TypeError("product_ids must be a list or numpy array")
    if not all(isinstance(i, str) for i in product_ids):
        raise TypeError("every product id must be a str")
    if not isinstance(filename, str):
        raise TypeError("filename must be a str")
    if not filename:
        raise ValueError("filename must be non-empty")
    if not isinstance(ratings, bool):
        raise TypeError("ratings must be a bool")
    if api_key is not None and not isinstance(api_key, str):
        raise TypeError("api_key must be a str or None")

    # Keepa API limit; query for ratings and reviews consumes more tokens
    max_ids = 150 if ratings else 300
    if not 0 < len(product_ids) <= max_ids:
        raise ValueError(
            "expected between 1 and {0} product ids, got {1}".format(
                max_ids, len(product_ids)))

    if api_key is None:
        # NOTE(review): the fallback literal is a credential committed to
        # source control; it is kept only so existing callers keep working.
        # Rotate it and rely on KEEPA_API_KEY instead.
        api_key = os.environ.get(
            'KEEPA_API_KEY',
            'e6ihvarndmd2iee2bgeg60afm06gru9242g310tb4tv1kji72u57uon4us908d5h',
        )
    api = keepa.Keepa(api_key)

    try:
        # Obtain product dicts
        products = api.query(product_ids, rating=ratings)
        np.save(filename, products)
    except KeyboardInterrupt:
        # Manual exit from the program due to lack of tokens
        print("Out of tokens...")
    except Exception as err:
        # Stay best-effort like the original, but report the real failure
        # instead of blaming every error on the token budget.
        print("Failed to fetch or save products: {0!r}".format(err))
    else:
        print("Product Dictionaries saved to {0}".format(filename))
        return products

    # Only reached when the query/save did not complete.
    print('Time to refill tokens: ', api.time_to_refill, '\n')
    print('Token Status: ', api.update_status())
    return None
def save_products_with_price_history(num_products, npy_filenames, filename):
    '''Given one or more npy files of product dicts, save the #num_products with the longest price histories

    Price-history length is measured as ``len(product['data']['AMAZON'])``.

    :param num_products: number of products to save
    :type num_products: int
    :param npy_filenames: filename(s) of npy files with product dicts; either
        a single filename or an iterable of filenames
    :type npy_filenames: str or list of str
    :param filename: output filename to save products with longest price history to
    :type filename: str
    :return: products with the longest price history, longest first (at most
        ``num_products`` of them)
    :rtype: list of dict
    :raises TypeError: if ``num_products`` is not an int or a filename is not a str
    :raises ValueError: if ``num_products`` is not positive
    '''
    # Explicit raises instead of ``assert`` so validation survives ``python -O``.
    if not isinstance(num_products, int):
        raise TypeError("num_products must be an int")
    if num_products <= 0:
        raise ValueError("num_products must be positive")

    # Normalise to a list so a single filename and any iterable of filenames
    # share one code path (the old single-filename branch loaded an
    # undefined variable and raised NameError).
    if isinstance(npy_filenames, str):
        source_files = [npy_filenames]
    else:
        source_files = list(npy_filenames)
    if not all(isinstance(name, str) for name in source_files):
        raise TypeError("npy_filenames must be a str or an iterable of str")

    products = []
    for name in source_files:
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files produced by this project, never untrusted input.
        products.extend(np.load(name, allow_pickle=True))

    # Longest price history first; the sort is stable, so ties keep load order.
    products.sort(key=lambda product: len(product['data']['AMAZON']), reverse=True)
    products = products[:num_products]
    np.save(filename, products)
    # Report how many were actually saved, which can be fewer than requested.
    print("Saved {0} Product Dictionaries with the longest price history to {1}".format(len(products), filename))
    return products
if __name__ == "__main__":
    # Amazon category IDs used by the extraction runs below.
    electronics, office_products, softwares, toys = (
        '172282',
        '1064954',
        '229534',
        '165793011',
    )

    # Current run: fetch and save the best sellers of the toys category.
    save_bestsellers_from_cat(cat=toys, filename="toys_bestsellers.npy")

    # ------------------------------------------------------------------
    # Recipe used for the earlier categories (kept for reference).
    #
    # 1. Save the category's best sellers:
    #      save_bestsellers_from_cat(office_products, "office_products_bestsellers.npy")
    #      save_bestsellers_from_cat(softwares, "software_products_bestsellers.npy")
    #      save_bestsellers_from_cat(electronics, "bestsellers.npy")
    #
    # 2. Load them back and query product dicts in slices that respect the
    #    per-call limit of save_products (150 IDs when ratings=True):
    #      office_products_bestsellers = np.load("office_products_bestsellers.npy", allow_pickle=True)
    #      save_products(office_products_bestsellers[450:600], "office_products_450_600_ratings.npy", ratings=True)
    #      software_products_bestsellers = np.load("software_products_bestsellers.npy", allow_pickle=True)
    #      save_products(software_products_bestsellers[560:570], "software_products_560_570_ratings.npy", ratings=True)
    #      bestsellers = np.load("bestsellers.npy", allow_pickle=True)
    #      save_products(bestsellers[150:300], "product_electronics_test_ratings.npy", ratings=True)
    #
    # 3. Merge the slice files and keep the 200 products with the longest
    #    price history:
    #      office_products_filenames = ["office_products_0_150_ratings.npy", "office_products_150_300_ratings.npy",
    #                                   "office_products_300_450_ratings.npy", "office_products_450_600_ratings.npy"]
    #      save_products_with_price_history(200, office_products_filenames, "office_products_sorted_ph.npy")
    #
    #      filenames = ["product_electronics_0_150_ratings.npy", "product_electronics_150_300_ratings.npy",
    #                   "product_electronics_300_450_ratings.npy", "product_electronics_450_600_ratings.npy"]
    #      save_products_with_price_history(200, filenames, "product_electronics_sorted_ph.npy")
    #
    #    The software slices were saved in many small files
    #    ('software_products_<start>_<end>_ratings.npy', covering 0-150 and
    #    then 150-560 in steps of 10 or 20); their names were collected with
    #      import glob
    #      print(glob.glob('*.npy'))
    #    and then merged with
    #      save_products_with_price_history(200, filenames, "product_software_sorted_ph.npy")
    #
    # 4. Sanity checks used along the way:
    #      products = np.load("product_electronics_300_600.npy", allow_pickle=True)
    #      print(len(products))
    #      api = keepa.Keepa(api_key)  # same key the functions above use
    #      print('Time to refill tokens: ', api.time_to_refill, '\n')
    # ------------------------------------------------------------------