crawler.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
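# Build a product-URL task list from the platform API, normalize taobao/tmall
# links, and dump them to /tmp/url_list; get_info_from_manmanbuy() can query
# manmanbuy (method searchapi_proinfobyurl) for product info on each task.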
import requests
import urlparse
def fetch_all_tasks():
    # fetch the url list from the platform
    platform_url = "http://api.platform.yohoops.org:8088/platform/productUrl/queryUrlList?page=0&size=50000"
    r = requests.get(platform_url).json()
    if r['code'] != 200:
        print("Fetch task failed! %s" % r)
        return {}  # keep the return type consistent so callers can iterate safely
    tasks = {}  # productId -> url
    for item in r['data']:
        url = item['jdUrl']
        # fall back to tmallUrl, then let taobaoUrl override it when both exist
        if not url:
            if item['tmallUrl']:
                url = item['tmallUrl']
            if item['taobaoUrl']:
                url = item['taobaoUrl']
        if not url:
            print("url not found at %s" % item)
            continue
        # get id & skuId; parse_qs maps each key to a list, so take the first value
        query = urlparse.urlparse(url)
        params = urlparse.parse_qs(query.query)
        item_id = params.get('id', [None])[0]
        sku_id = params.get('skuId', [''])[0]
        if not item_id:
            print("can not get id from url: %s" % url)
            continue
        # final taobao or tmall url
        url = ("%s://%s/item.html?id=%s&skuId=%s"
               % (query.scheme, query.netloc, item_id, sku_id)).encode('utf-8')
        tasks[item['productId']] = url
    return tasks
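# Normalization example for the loop above (the extra tracking param is hypothetical):
#   in:  https://detail.tmall.com/item.htm?id=573712976128&skuId=3750371993869&spm=a230r.1.2.3
#   out: https://detail.tmall.com/item.html?id=573712976128&skuId=3750371993869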
def write_to_file(tasks):
    with open("/tmp/url_list", "w") as f:
        for product_id, url in tasks.items():
            f.write("%s\r\n" % url)
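# /tmp/url_list is plain text, one normalized URL per CRLF-terminated line,
# rewritten from scratch on every run.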
def get_info_from_manmanbuy(tasks):
    # sample url:  https://detail.tmall.com/item.htm?id=573712976128&skuId=3750371993869
    # encoded url: https%3A//detail.tmall.com/item.htm%3Fid%3D573712976128%26skuId%3D3750371993869
    for product_id, url in tasks.items():
        crawler_url = ('http://sapi.manmanbuy.com/searchAPI.ashx'
                       '?method=searchapi_proinfobyurl&AppKey=FvHIWNzmCsZYY70C'
                       '&url=%s' % requests.utils.quote(url))
        r = requests.get(crawler_url)
        if r.status_code != 200:
            print("Failed to get result at url: %s" % crawler_url)
            continue
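        # Only the status is checked so far; a minimal sketch of consuming the
        # body, assuming the API returns JSON (field names are hypothetical):
        # info = r.json()
        # print("%s -> %s" % (product_id, info.get('price')))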
if __name__ == "__main__":
    tasks = fetch_all_tasks()
    write_to_file(tasks)
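    # get_info_from_manmanbuy() is defined above but not called here; uncomment
    # to also query manmanbuy for each task:
    # get_info_from_manmanbuy(tasks)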