Showing
4 changed files
with
157 additions
and
4 deletions
common/src/main/java/com/yoho/global/common/crawler/constants/CrawlerDestinationUrls.java
0 → 100644
1 | +/* | ||
2 | + * Copyright (C), 2016-2016, yoho | ||
3 | + * FileName: CrawlerDestinationUrls.java | ||
4 | + * Author: Maelk_liu | ||
5 | + * Date: 2016年11月28日 下午1:49:54 | ||
6 | + * Description: //模块目的、功能描述 | ||
7 | + * History: //修改记录 | ||
8 | + * <author> <time> <version> <description> | ||
9 | + * 修改人姓名 修改时间 版本号 描述 | ||
10 | + */ | ||
11 | +package com.yoho.global.common.crawler.constants; | ||
12 | + | ||
13 | +/** | ||
14 | + * Holds URL constants for crawler destination sites (per the interface name).<br> | ||
15 | + * Only the zozo.jp URL is declared so far; the block comment inside lists further site URLs paired with names (presumably handler classes - TODO confirm). | ||
16 | + * | ||
17 | + * @author Maelk_liu | ||
18 | + * @see [related classes/methods] (optional) | ||
19 | + * @since [product/module version] (optional) | ||
20 | + */ | ||
21 | +public interface CrawlerDestinationUrls { | ||
22 | + | ||
23 | + /*'http://zozo.jp' => '\Zozojp', | ||
24 | + 'http://www.stylife.co.jp' => '\Stylife', | ||
25 | + 'http://www.a-land.co.kr'=> '\Aland', | ||
26 | + 'http://www.mrporter.com|https://www.mrporter.com' => '\Mrporter', yaoxiaofeng | ||
27 | + 'http://www.stylife.co.jp' => '\Stylife', | ||
28 | + 'http://www.sneakersnstuff.com'=>'\Sneakersnstuff',wuxiao | ||
29 | + 'http://www.zappos.com'=>'\Zappos', yaoxiaofeng | ||
30 | + 'http://us.asos.com'=>'\Asos', yaoxiaofeng | ||
31 | + 'https://www.shopbop.com'=>'\Shopbop', | ||
32 | + 'http://www.endclothing.com|http://www.endclothing.co.uk' => '\Endclothing', | ||
33 | + 'http://www.supremenewyork.com' => '\Supreme', | ||
34 | + 'http://shop.visvim.tv|https://shop.visvim.tv' => '\Visvim', | ||
35 | + 'http://www.asos.com'=>'\Asosuk', wuxiao | ||
36 | + 'http://cn.alandglobal.com'=>'\Aland', wuxiao*/ | ||
37 | + public static final String ZOZO_URL="http://zozo.jp"; | ||
38 | + | ||
39 | +} |
@@ -30,6 +30,7 @@ public class MyCousonloPipeline implements Pipeline<SpiderBean> { | @@ -30,6 +30,7 @@ public class MyCousonloPipeline implements Pipeline<SpiderBean> { | ||
30 | if(bean.getClass().isAssignableFrom(MyGithub.class)){ | 30 | if(bean.getClass().isAssignableFrom(MyGithub.class)){ |
31 | MyGithub mygit =(MyGithub)bean; | 31 | MyGithub mygit =(MyGithub)bean; |
32 | System.out.println(mygit.getUser()); | 32 | System.out.println(mygit.getUser()); |
33 | + System.out.println(mygit.getReadme()); | ||
33 | } | 34 | } |
34 | //System.out.println(JSON.toJSONString(bean)); | 35 | //System.out.println(JSON.toJSONString(bean)); |
35 | } | 36 | } |
@@ -3,6 +3,7 @@ package com.yoho.global.crawler; | @@ -3,6 +3,7 @@ package com.yoho.global.crawler; | ||
3 | import com.geccocrawler.gecco.GeccoEngine; | 3 | import com.geccocrawler.gecco.GeccoEngine; |
4 | import com.geccocrawler.gecco.annotation.Gecco; | 4 | import com.geccocrawler.gecco.annotation.Gecco; |
5 | import com.geccocrawler.gecco.annotation.Href; | 5 | import com.geccocrawler.gecco.annotation.Href; |
6 | +import com.geccocrawler.gecco.annotation.Html; | ||
6 | import com.geccocrawler.gecco.annotation.HtmlField; | 7 | import com.geccocrawler.gecco.annotation.HtmlField; |
7 | import com.geccocrawler.gecco.annotation.Request; | 8 | import com.geccocrawler.gecco.annotation.Request; |
8 | import com.geccocrawler.gecco.annotation.RequestParameter; | 9 | import com.geccocrawler.gecco.annotation.RequestParameter; |
@@ -42,8 +43,25 @@ public class MyGithub implements HtmlBean { | @@ -42,8 +43,25 @@ public class MyGithub implements HtmlBean { | ||
42 | 43 | ||
43 | @HtmlField(cssPath=".entry-content") | 44 | @HtmlField(cssPath=".entry-content") |
44 | private String readme; | 45 | private String readme; |
45 | - | ||
46 | - public HttpRequest getRequest() { | 46 | + |
47 | + @Html | ||
48 | + private String htmlcontent; | ||
49 | + | ||
50 | + /** | ||
51 | + * @return the current value of the htmlcontent field | ||
52 | + */ | ||
53 | + public String getHtmlcontent() { | ||
54 | + return htmlcontent; | ||
55 | + } | ||
56 | + | ||
57 | + /** | ||
58 | + * @param htmlcontent the new value to store in the htmlcontent field | ||
59 | + */ | ||
60 | + public void setHtmlcontent(String htmlcontent) { | ||
61 | + this.htmlcontent = htmlcontent; | ||
62 | + } | ||
63 | + | ||
64 | + public HttpRequest getRequest() { | ||
47 | return request; | 65 | return request; |
48 | } | 66 | } |
49 | 67 | ||
@@ -109,7 +127,7 @@ public class MyGithub implements HtmlBean { | @@ -109,7 +127,7 @@ public class MyGithub implements HtmlBean { | ||
109 | 127 | ||
110 | public static void main(String[] args) { | 128 | public static void main(String[] args) { |
111 | GeccoEngine.create() | 129 | GeccoEngine.create() |
112 | - .classpath("com.geccocrawler.gecco.demo") | 130 | + .classpath("com.yoho.global.crawler") |
113 | //开始抓取的页面地址 | 131 | //开始抓取的页面地址 |
114 | .start("https://github.com/xtuhcy/gecco") | 132 | .start("https://github.com/xtuhcy/gecco") |
115 | .start("https://github.com/xtuhcy/gecco-spring") | 133 | .start("https://github.com/xtuhcy/gecco-spring") |
@@ -118,7 +136,7 @@ public class MyGithub implements HtmlBean { | @@ -118,7 +136,7 @@ public class MyGithub implements HtmlBean { | ||
118 | //单个爬虫每次抓取完一个请求后的间隔时间 | 136 | //单个爬虫每次抓取完一个请求后的间隔时间 |
119 | .interval(2000) | 137 | .interval(2000) |
120 | //循环抓取 | 138 | //循环抓取 |
121 | - .loop(true) | 139 | + .loop(false) |
122 | //采用pc端userAgent | 140 | //采用pc端userAgent |
123 | .mobile(false) | 141 | .mobile(false) |
124 | //是否开启debug模式,跟踪页面元素抽取 | 142 | //是否开启debug模式,跟踪页面元素抽取 |
1 | +/* | ||
2 | + * Copyright (C), 2016-2016, yoho | ||
3 | + * FileName: CrawlerContentService.java | ||
4 | + * Author: Maelk_liu | ||
5 | + * Date: 2016年11月28日 下午1:55:34 | ||
6 | + * Description: //模块目的、功能描述 | ||
7 | + * History: //修改记录 | ||
8 | + * <author> <time> <version> <description> | ||
9 | + * 修改人姓名 修改时间 版本号 描述 | ||
10 | + */ | ||
11 | +package com.yoho.global.crawler.service; | ||
12 | + | ||
13 | +import java.util.List; | ||
14 | + | ||
15 | +/** | ||
16 | + * Content-crawling interface. | ||
17 | + * | ||
18 | + * @author Maelk_liu | ||
19 | + * @see [related classes/methods] (optional) | ||
20 | + * @since [product/module version] (optional) | ||
21 | + */ | ||
22 | +public interface CrawlerContentService { | ||
23 | + | ||
24 | + | ||
25 | + /** | ||
26 | + * Returns the crawled HTML source code. | ||
27 | + * @return the HTML source as a string | ||
28 | + */ | ||
29 | + public String getHtmlSource(); | ||
30 | + | ||
31 | + /** | ||
32 | + * Gets the title. | ||
33 | + * @return the title | ||
34 | + */ | ||
35 | + public String getTitle(); | ||
36 | + | ||
37 | + /** | ||
38 | + * Gets the keywords. | ||
39 | + * @return the keywords | ||
40 | + */ | ||
41 | + public String getKeywords(); | ||
42 | + | ||
43 | + /** | ||
44 | + * Gets the description. | ||
45 | + * @return the description | ||
46 | + */ | ||
47 | + public String getDescription(); | ||
48 | + | ||
49 | + /** | ||
50 | + * Gets the brand name. | ||
51 | + */ | ||
52 | + public String getBrand(); | ||
53 | + | ||
54 | + /** | ||
55 | + * Gets the product name. | ||
56 | + */ | ||
57 | + public String getName(); | ||
58 | + | ||
59 | + /** | ||
60 | + * Gets the price. | ||
61 | + * @param color the color on the target site | ||
62 | + * @param size the size on the target site | ||
63 | + * @param goodsn the product number on the target site | ||
64 | + */ | ||
65 | + public String getPrice(String color, String size, String goodsn); | ||
66 | + | ||
67 | + /** | ||
68 | + * Gets whether overseas delivery is supported. | ||
69 | + */ | ||
70 | + public String getOverseas(); | ||
71 | + | ||
72 | + /** | ||
73 | + * Gets the product model information. | ||
74 | + */ | ||
75 | + public String getModel(); | ||
76 | + | ||
77 | + /** | ||
78 | + * Gets the stock. | ||
79 | + * @param color the color on the target site | ||
80 | + * @param size the size on the target site | ||
81 | + * @param goodsn the product number on the target site | ||
82 | + */ | ||
83 | + public String getStock(String color, String size, String goodsn); | ||
84 | + | ||
85 | + /** | ||
86 | + * Gets the pictures. | ||
87 | + */ | ||
88 | + public List<String> getPics(); | ||
89 | + | ||
90 | + /** | ||
91 | + * Gets the product details. | ||
92 | + */ | ||
93 | + public String getContent(); | ||
94 | + | ||
95 | +} |
-
Please register or login to post a comment