-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathapp.py
126 lines (91 loc) · 3.51 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
import json
import csv
import re
# Paginated category-listing endpoint (category 8095 = laptops); "{}" takes the page number.
laptop_page_url = "https://tiki.vn/api/v2/products?limit=48&include=advertisement&aggregations=1&category=8095&page={}&urlKey=laptop"
# Product-detail endpoint; "{}" takes a product id.
product_url = "https://tiki.vn/api/v2/products/{}"
# Backup file: crawled product ids, one per line.
product_id_file = "./data/product-id.txt"
# Backup file: raw JSON detail responses, one per line.
product_data_file = "./data/product.txt"
# Final flattened CSV output.
product_file = "./data/product.csv"
# Browser-like User-Agent so the API does not reject the requests.
headers = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"}
def crawl_product_id():
    """Walk the laptop category pages and collect every product id.

    Returns a tuple ``(ids, page)`` where ``ids`` is the list of product
    ids as strings and ``page`` is the first page number that yielded
    nothing (i.e. pages crawled + 1).
    """
    ids = []
    page = 1
    while True:
        print("Crawl page: ", page)
        print(laptop_page_url.format(page))
        response = requests.get(laptop_page_url.format(page), headers=headers)
        # Stop at the first non-OK response...
        if response.status_code != 200:
            break
        products = json.loads(response.text)["data"]
        # ...or the first empty listing page.
        if not products:
            break
        for item in products:
            pid = str(item["id"])
            print("Product ID: ", pid)
            ids.append(pid)
        page += 1
    return ids, page
def save_product_id(product_list=None):
    """Write the crawled product ids to ``product_id_file``, one per line.

    product_list: iterable of id strings; ``None`` means an empty list.
    """
    # None sentinel instead of the original mutable default `=[]`,
    # which is shared across calls.
    if product_list is None:
        product_list = []
    # `with` guarantees the handle is closed even if write() raises
    # (original opened/closed manually and shadowed the builtin `str`).
    with open(product_id_file, "w", encoding="utf-8") as f:
        f.write("\n".join(product_list))
    print("Save file: ", product_id_file)
def crawl_product(product_list=None):
    """Fetch the raw JSON detail body for every product id.

    product_list: iterable of product id strings; ``None`` means empty.
    Returns the list of raw response texts; only HTTP 200 responses are
    kept, but every attempt is logged regardless of status.
    """
    # None sentinel instead of the original shared mutable default `=[]`.
    if product_list is None:
        product_list = []
    product_detail_list = []
    for product_id in product_list:
        response = requests.get(product_url.format(product_id), headers=headers)
        if response.status_code == 200:
            product_detail_list.append(response.text)
        print("Crawl product: ", product_id, ": ", response.status_code)
    return product_detail_list
# Nested JSON fields that are re-serialized to single-line strings so each
# product record fits on one CSV row.
flatten_field = [ "badges", "inventory", "categories", "rating_summary",
    "brand", "seller_specifications", "current_seller", "other_sellers",
    "configurable_options", "configurable_products", "specifications", "product_links",
    "services_and_promotions", "promotions", "stock_item", "installment_info" ]
def adjust_product(product):
    """Parse one raw product JSON string and flatten its nested fields.

    Returns the parsed dict with every field listed in ``flatten_field``
    re-encoded as a one-line JSON string, or ``None`` when the record has
    no truthy "id".
    """
    record = json.loads(product)
    if not record.get("id", False):
        return None
    for key in flatten_field:
        if key not in record:
            continue
        # Serialize the nested value and drop raw newlines so the value
        # stays on a single line in the CSV.
        record[key] = json.dumps(record[key], ensure_ascii=False).replace('\n','')
    return record
def save_raw_product(product_detail_list=None):
    """Back up the raw product JSON bodies to ``product_data_file``, one per line."""
    # None sentinel instead of the original shared mutable default `=[]`.
    if product_detail_list is None:
        product_detail_list = []
    # `with` closes the file even on error; utf-8 keeps Vietnamese text
    # intact regardless of the platform's default encoding.
    # (Original shadowed the builtin `str` and never used a context manager.)
    with open(product_data_file, "w", encoding="utf-8") as f:
        f.write("\n".join(product_detail_list))
    print("Save file: ", product_data_file)
def load_raw_product():
    """Reload the raw JSON backup; one (newline-terminated) line per product."""
    # `with` fixes the original's leaked file handle, which was never closed.
    with open(product_data_file, "r", encoding="utf-8") as f:
        return f.readlines()
def save_product_list(product_json_list):
    """Write the flattened product dicts to ``product_file`` as CSV.

    The header row comes from the keys of the first non-None record;
    ``None`` entries (records rejected by adjust_product) are skipped.
    Assumes all records share the same key order — TODO confirm against
    the API responses.
    """
    # newline="" is required by the csv module (otherwise rows are doubled
    # on Windows); `with` closes the file even if a write raises.
    with open(product_file, "w", newline="", encoding="utf-8") as f:
        csv_writer = csv.writer(f)
        wrote_header = False
        for p in product_json_list:
            if p is None:
                continue
            if not wrote_header:
                csv_writer.writerow(p.keys())
                wrote_header = True
            csv_writer.writerow(p.values())
    print("Save file: ", product_file)
# --- script driver: runs at import/execution time ---
# 1) collect every product id from the category listing pages
product_list, page = crawl_product_id()
print("No. Page: ", page)
print("No. Product ID: ", len(product_list))
# 2) back up the ids so a failed run does not require re-listing
save_product_id(product_list)
# 3) fetch the raw JSON detail body for each id
product_list = crawl_product(product_list)
# 4) back up the raw responses before any transformation
save_raw_product(product_list)
# product_list = load_raw_product()  # uncomment to resume from the backup instead of re-crawling
# 5) flatten nested fields so each record fits one CSV row; failed records become None
product_json_list = [adjust_product(p) for p in product_list]
# 6) write the final CSV
save_product_list(product_json_list)