Showing
11 changed files
with
114 additions
and
54 deletions
@@ -18,6 +18,11 @@ public class SuggestionConstants { | @@ -18,6 +18,11 @@ public class SuggestionConstants { | ||
18 | 18 | ||
19 | public static final List<String> IGNORE_KEYWORDS = Arrays.asList("其他","正常","中文","中国","普通","2%"); | 19 | public static final List<String> IGNORE_KEYWORDS = Arrays.asList("其他","正常","中文","中国","普通","2%"); |
20 | 20 | ||
21 | + /** | ||
22 | + * 返回智能搜索词的数量 | ||
23 | + */ | ||
24 | + public static final int SMART_SUGGESTION_TERM_COUNT = 5; | ||
25 | + | ||
21 | public static String YOHO_SEARCH_KEYWORDS_HOT = "YOHO.SEARCH.KEYWORDS.HOT"; | 26 | public static String YOHO_SEARCH_KEYWORDS_HOT = "YOHO.SEARCH.KEYWORDS.HOT"; |
22 | 27 | ||
23 | 28 |
1 | -package com.yoho.search.spider.service; | 1 | +package com.yoho.search.spider.common; |
2 | 2 | ||
3 | import org.htmlcleaner.HtmlCleaner; | 3 | import org.htmlcleaner.HtmlCleaner; |
4 | import org.htmlcleaner.TagNode; | 4 | import org.htmlcleaner.TagNode; |
@@ -16,7 +16,7 @@ import java.net.URLConnection; | @@ -16,7 +16,7 @@ import java.net.URLConnection; | ||
16 | * Created by ginozhang on 2017/2/28. | 16 | * Created by ginozhang on 2017/2/28. |
17 | */ | 17 | */ |
18 | @Component | 18 | @Component |
19 | -public class HttpRequestService { | 19 | +public class SpiderBasedHttpRequest { |
20 | 20 | ||
21 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); | 21 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); |
22 | 22 | ||
@@ -120,7 +120,7 @@ public class HttpRequestService { | @@ -120,7 +120,7 @@ public class HttpRequestService { | ||
120 | } | 120 | } |
121 | 121 | ||
122 | public static void main(String[] args) throws XPatherException { | 122 | public static void main(String[] args) throws XPatherException { |
123 | - HttpRequestService service = new HttpRequestService(); | 123 | + SpiderBasedHttpRequest service = new SpiderBasedHttpRequest(); |
124 | String url = "http://baike.baidu.com/item/vans"; | 124 | String url = "http://baike.baidu.com/item/vans"; |
125 | BaikeBO baikeBO = service.get(url); | 125 | BaikeBO baikeBO = service.get(url); |
126 | System.out.println(baikeBO); | 126 | System.out.println(baikeBO); |
@@ -2,7 +2,8 @@ package com.yoho.search.spider.controller; | @@ -2,7 +2,8 @@ package com.yoho.search.spider.controller; | ||
2 | 2 | ||
3 | import com.yoho.search.spider.conversation.SuggestConvertorService; | 3 | import com.yoho.search.spider.conversation.SuggestConvertorService; |
4 | import com.yoho.search.spider.job.SpiderJob; | 4 | import com.yoho.search.spider.job.SpiderJob; |
5 | -import com.yoho.search.spider.service.BaikeSpiderService; | 5 | +import com.yoho.search.spider.full.BaikeURLSpiderService; |
6 | +import com.yoho.search.spider.increment.BlackKeywordsMgr; | ||
6 | import org.springframework.beans.factory.annotation.Autowired; | 7 | import org.springframework.beans.factory.annotation.Autowired; |
7 | import org.springframework.stereotype.Controller; | 8 | import org.springframework.stereotype.Controller; |
8 | import org.springframework.web.bind.annotation.RequestMapping; | 9 | import org.springframework.web.bind.annotation.RequestMapping; |
@@ -19,7 +20,7 @@ import java.util.Map; | @@ -19,7 +20,7 @@ import java.util.Map; | ||
19 | public class BaikeSpiderController { | 20 | public class BaikeSpiderController { |
20 | 21 | ||
21 | @Autowired | 22 | @Autowired |
22 | - private BaikeSpiderService baikeSpiderService; | 23 | + private BaikeURLSpiderService baikeURLSpiderService; |
23 | 24 | ||
24 | @Autowired | 25 | @Autowired |
25 | private SpiderJob spiderJob; | 26 | private SpiderJob spiderJob; |
@@ -27,11 +28,14 @@ public class BaikeSpiderController { | @@ -27,11 +28,14 @@ public class BaikeSpiderController { | ||
27 | @Autowired | 28 | @Autowired |
28 | private SuggestConvertorService suggestConvertorService; | 29 | private SuggestConvertorService suggestConvertorService; |
29 | 30 | ||
31 | + @Autowired | ||
32 | + private BlackKeywordsMgr blackKeywordsMgr; | ||
33 | + | ||
30 | @RequestMapping(value = "/spider/baike") | 34 | @RequestMapping(value = "/spider/baike") |
31 | @ResponseBody | 35 | @ResponseBody |
32 | public Map<String, Object> baikeSpider() { | 36 | public Map<String, Object> baikeSpider() { |
33 | try { | 37 | try { |
34 | - Map<String, Object> result = baikeSpiderService.start(); | 38 | + Map<String, Object> result = baikeURLSpiderService.start(); |
35 | result.put("code", 200); | 39 | result.put("code", 200); |
36 | result.put("message", "success"); | 40 | result.put("message", "success"); |
37 | return result; | 41 | return result; |
@@ -94,4 +98,21 @@ public class BaikeSpiderController { | @@ -94,4 +98,21 @@ public class BaikeSpiderController { | ||
94 | return result; | 98 | return result; |
95 | } | 99 | } |
96 | } | 100 | } |
101 | + | ||
102 | + @RequestMapping(value = "/spider/blackList") | ||
103 | + @ResponseBody | ||
104 | + public Map<String, Object> blackList(@RequestParam String keyword) { | ||
105 | + Map<String, Object> result = new HashMap(); | ||
106 | + | ||
107 | + try { | ||
108 | + result.put("data", blackKeywordsMgr.getBlackKeywords()); | ||
109 | + result.put("code", 200); | ||
110 | + result.put("message", "success"); | ||
111 | + return result; | ||
112 | + } catch (Exception e) { | ||
113 | + result.put("code", 503); | ||
114 | + result.put("message", e.getMessage()); | ||
115 | + return result; | ||
116 | + } | ||
117 | + } | ||
97 | } | 118 | } |
@@ -25,8 +25,6 @@ public class SuggestConvertorService { | @@ -25,8 +25,6 @@ public class SuggestConvertorService { | ||
25 | 25 | ||
26 | private static final String ANALYZER = "ik_smart"; | 26 | private static final String ANALYZER = "ik_smart"; |
27 | 27 | ||
28 | - private static final int MAX_KEYWORDS = 5; | ||
29 | - | ||
30 | @Autowired | 28 | @Autowired |
31 | private AnalyzerHelper analyzerHelper; | 29 | private AnalyzerHelper analyzerHelper; |
32 | 30 | ||
@@ -66,6 +64,7 @@ public class SuggestConvertorService { | @@ -66,6 +64,7 @@ public class SuggestConvertorService { | ||
66 | } | 64 | } |
67 | } | 65 | } |
68 | 66 | ||
67 | + logger.info("[func=buildYohoKeywordBO][yohoKeywordsSize={}]", yohoKeywords.size()); | ||
69 | return new YohoKeywordsBO(yohoKeywords, filterSortNameSet); | 68 | return new YohoKeywordsBO(yohoKeywords, filterSortNameSet); |
70 | } | 69 | } |
71 | 70 | ||
@@ -133,17 +132,19 @@ public class SuggestConvertorService { | @@ -133,17 +132,19 @@ public class SuggestConvertorService { | ||
133 | return null; | 132 | return null; |
134 | } | 133 | } |
135 | 134 | ||
136 | - logger.trace("[func=SuggestConversionFlow.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult); | 135 | + logger.trace("[func=SuggestConvertorService.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult); |
137 | List<String> sortedKeywords = com.yoho.search.base.utils.CollectionUtils.getSortedKeys(matchResult, false); | 136 | List<String> sortedKeywords = com.yoho.search.base.utils.CollectionUtils.getSortedKeys(matchResult, false); |
138 | 137 | ||
139 | // 如果可以 总是让品类名在第一个 | 138 | // 如果可以 总是让品类名在第一个 |
140 | String firstSortName = null; | 139 | String firstSortName = null; |
141 | for (String keyword : sortedKeywords) { | 140 | for (String keyword : sortedKeywords) { |
142 | - firstSortName = keyword; | ||
143 | - break; | 141 | + if (yohoKeywordsBO.getSortNameMap().contains(keyword)) { |
142 | + firstSortName = keyword; | ||
143 | + break; | ||
144 | + } | ||
144 | } | 145 | } |
145 | 146 | ||
146 | - List<String> resultKeywordList = new ArrayList<>(5); | 147 | + List<String> resultKeywordList = new ArrayList<>(SuggestionConstants.SMART_SUGGESTION_TERM_COUNT); |
147 | if (firstSortName != null) { | 148 | if (firstSortName != null) { |
148 | resultKeywordList.add(firstSortName); | 149 | resultKeywordList.add(firstSortName); |
149 | } | 150 | } |
@@ -151,7 +152,7 @@ public class SuggestConvertorService { | @@ -151,7 +152,7 @@ public class SuggestConvertorService { | ||
151 | for (String keyword : sortedKeywords) { | 152 | for (String keyword : sortedKeywords) { |
152 | if (keyword != null && !keyword.equalsIgnoreCase(firstSortName)) { | 153 | if (keyword != null && !keyword.equalsIgnoreCase(firstSortName)) { |
153 | resultKeywordList.add(keyword); | 154 | resultKeywordList.add(keyword); |
154 | - if (resultKeywordList.size() == MAX_KEYWORDS) { | 155 | + if (resultKeywordList.size() == SuggestionConstants.SMART_SUGGESTION_TERM_COUNT) { |
155 | break; | 156 | break; |
156 | } | 157 | } |
157 | } | 158 | } |
1 | -package com.yoho.search.spider.service; | 1 | +package com.yoho.search.spider.full; |
2 | 2 | ||
3 | import com.yoho.search.base.utils.ApplicationContextUtil; | 3 | import com.yoho.search.base.utils.ApplicationContextUtil; |
4 | import com.yoho.search.consumer.service.base.SpiderContentService; | 4 | import com.yoho.search.consumer.service.base.SpiderContentService; |
5 | import com.yoho.search.dal.model.SpiderContent; | 5 | import com.yoho.search.dal.model.SpiderContent; |
6 | +import com.yoho.search.spider.common.BaikeBO; | ||
6 | import org.slf4j.Logger; | 7 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 8 | import org.slf4j.LoggerFactory; |
8 | 9 |
1 | -package com.yoho.search.spider.service; | 1 | +package com.yoho.search.spider.full; |
2 | 2 | ||
3 | import com.yoho.search.base.utils.FileUtils; | 3 | import com.yoho.search.base.utils.FileUtils; |
4 | +import com.yoho.search.spider.common.BaikeBO; | ||
5 | +import com.yoho.search.spider.common.SpiderBasedHttpRequest; | ||
4 | import org.apache.commons.collections.CollectionUtils; | 6 | import org.apache.commons.collections.CollectionUtils; |
5 | import org.slf4j.Logger; | 7 | import org.slf4j.Logger; |
6 | import org.slf4j.LoggerFactory; | 8 | import org.slf4j.LoggerFactory; |
@@ -18,12 +20,12 @@ public class BaikeURLCrawler implements Callable<Integer> { | @@ -18,12 +20,12 @@ public class BaikeURLCrawler implements Callable<Integer> { | ||
18 | 20 | ||
19 | private final Set<String> existSubjects; | 21 | private final Set<String> existSubjects; |
20 | 22 | ||
21 | - private final HttpRequestService httpRequestService; | 23 | + private final SpiderBasedHttpRequest spiderBasedHttpRequest; |
22 | 24 | ||
23 | - public BaikeURLCrawler(File file, Set<String> existSubjects, HttpRequestService httpRequestService) { | 25 | + public BaikeURLCrawler(File file, Set<String> existSubjects, SpiderBasedHttpRequest spiderBasedHttpRequest) { |
24 | this.file = file; | 26 | this.file = file; |
25 | this.existSubjects = existSubjects; | 27 | this.existSubjects = existSubjects; |
26 | - this.httpRequestService = httpRequestService; | 28 | + this.spiderBasedHttpRequest = spiderBasedHttpRequest; |
27 | } | 29 | } |
28 | 30 | ||
29 | @Override | 31 | @Override |
@@ -52,7 +54,7 @@ public class BaikeURLCrawler implements Callable<Integer> { | @@ -52,7 +54,7 @@ public class BaikeURLCrawler implements Callable<Integer> { | ||
52 | try { | 54 | try { |
53 | if (!this.existSubjects.contains(title.toLowerCase().trim())) { | 55 | if (!this.existSubjects.contains(title.toLowerCase().trim())) { |
54 | LOGGER.info("[func=BaikeURLCrawler][title={}][url={}]", title, url); | 56 | LOGGER.info("[func=BaikeURLCrawler][title={}][url={}]", title, url); |
55 | - BaikeBO baikeBO = httpRequestService.get(url); | 57 | + BaikeBO baikeBO = spiderBasedHttpRequest.get(url); |
56 | if (baikeBO != null) { | 58 | if (baikeBO != null) { |
57 | baikeBO.setTitle(title); | 59 | baikeBO.setTitle(title); |
58 | BaikeBOBulkService.submitBaike(baikeBO); | 60 | BaikeBOBulkService.submitBaike(baikeBO); |
1 | -package com.yoho.search.spider.service; | 1 | +package com.yoho.search.spider.full; |
2 | 2 | ||
3 | -import com.yoho.search.consumer.service.base.*; | 3 | +import com.yoho.search.consumer.service.base.SpiderContentService; |
4 | +import com.yoho.search.spider.common.SpiderBasedHttpRequest; | ||
4 | import org.slf4j.Logger; | 5 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 6 | import org.slf4j.LoggerFactory; |
6 | import org.springframework.beans.factory.annotation.Autowired; | 7 | import org.springframework.beans.factory.annotation.Autowired; |
@@ -18,7 +19,7 @@ import java.util.concurrent.atomic.AtomicInteger; | @@ -18,7 +19,7 @@ import java.util.concurrent.atomic.AtomicInteger; | ||
18 | * Created by ginozhang on 2017/3/1. | 19 | * Created by ginozhang on 2017/3/1. |
19 | */ | 20 | */ |
20 | @Component | 21 | @Component |
21 | -public class BaikeSpiderService { | 22 | +public class BaikeURLSpiderService { |
22 | 23 | ||
23 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); | 24 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); |
24 | 25 | ||
@@ -30,14 +31,14 @@ public class BaikeSpiderService { | @@ -30,14 +31,14 @@ public class BaikeSpiderService { | ||
30 | private SpiderContentService spiderContentService; | 31 | private SpiderContentService spiderContentService; |
31 | 32 | ||
32 | @Autowired | 33 | @Autowired |
33 | - private HttpRequestService httpRequestService; | 34 | + private SpiderBasedHttpRequest spiderBasedHttpRequest; |
34 | 35 | ||
35 | public synchronized Map<String, Object> start() { | 36 | public synchronized Map<String, Object> start() { |
36 | long begin = System.currentTimeMillis(); | 37 | long begin = System.currentTimeMillis(); |
37 | LOGGER.info("[func=BaikeSpiderService.start][begin={}]", begin); | 38 | LOGGER.info("[func=BaikeSpiderService.start][begin={}]", begin); |
38 | Map<String, Object> result = new HashMap(); | 39 | Map<String, Object> result = new HashMap(); |
39 | 40 | ||
40 | - String baikeDir = BaikeSpiderService.class.getResource("/baike").getPath(); | 41 | + String baikeDir = BaikeURLSpiderService.class.getResource("/baike").getPath(); |
41 | File baikeDirFile = new File(baikeDir); | 42 | File baikeDirFile = new File(baikeDir); |
42 | if (!baikeDirFile.exists() || !baikeDirFile.isDirectory()) { | 43 | if (!baikeDirFile.exists() || !baikeDirFile.isDirectory()) { |
43 | throw new RuntimeException("Cannot find baike directory. baikeDir=" + baikeDir); | 44 | throw new RuntimeException("Cannot find baike directory. baikeDir=" + baikeDir); |
@@ -57,7 +58,7 @@ public class BaikeSpiderService { | @@ -57,7 +58,7 @@ public class BaikeSpiderService { | ||
57 | ExecutorService pool = Executors.newFixedThreadPool(POOL_SIZE, thread -> new Thread(thread, "BaikeSpider-" + atomicInteger.getAndIncrement())); | 58 | ExecutorService pool = Executors.newFixedThreadPool(POOL_SIZE, thread -> new Thread(thread, "BaikeSpider-" + atomicInteger.getAndIncrement())); |
58 | List<Future<Integer>> futures = new ArrayList<>(baikeUrlFiles.length); | 59 | List<Future<Integer>> futures = new ArrayList<>(baikeUrlFiles.length); |
59 | for (File baikeUrlFile : baikeUrlFiles) { | 60 | for (File baikeUrlFile : baikeUrlFiles) { |
60 | - futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, httpRequestService))); | 61 | + futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, spiderBasedHttpRequest))); |
61 | } | 62 | } |
62 | 63 | ||
63 | for (Future<Integer> future : futures) { | 64 | for (Future<Integer> future : futures) { |
1 | +package com.yoho.search.spider.increment; | ||
2 | + | ||
3 | +import com.yoho.core.redis.YHSetOperations; | ||
4 | +import com.yoho.search.base.utils.RedisKeys; | ||
5 | +import org.apache.commons.collections.CollectionUtils; | ||
6 | +import org.springframework.stereotype.Component; | ||
7 | + | ||
8 | +import javax.annotation.Resource; | ||
9 | +import java.util.List; | ||
10 | +import java.util.Set; | ||
11 | + | ||
12 | +/** | ||
13 | + * Created by ginozhang on 2017/3/3. | ||
14 | + * 爬虫黑名单管理,超过指定次数失败的加入该黑名单。 | ||
15 | + */ | ||
16 | +@Component | ||
17 | +public class BlackKeywordsMgr { | ||
18 | + | ||
19 | + @Resource(name = "yhNoSyncSetOperations") | ||
20 | + private YHSetOperations<String, String> yhNoSyncSetOperations; | ||
21 | + | ||
22 | + public Set<String> getBlackKeywords() { | ||
23 | + return yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID); | ||
24 | + } | ||
25 | + | ||
26 | + public void addBlackKeywords(List<String> failedKeywords) { | ||
27 | + if (CollectionUtils.isNotEmpty(failedKeywords)) { | ||
28 | + String[] keywordArray = new String[failedKeywords.size()]; | ||
29 | + yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray)); | ||
30 | + } | ||
31 | + } | ||
32 | + | ||
33 | +} |
1 | -package com.yoho.search.spider.service; | 1 | +package com.yoho.search.spider.increment; |
2 | 2 | ||
3 | -import com.yoho.core.redis.YHSetOperations; | ||
4 | import com.yoho.core.redis.YHZSetOperations; | 3 | import com.yoho.core.redis.YHZSetOperations; |
5 | import com.yoho.search.base.utils.RedisKeys; | 4 | import com.yoho.search.base.utils.RedisKeys; |
6 | import com.yoho.search.consumer.index.common.AnalyzerHelper; | 5 | import com.yoho.search.consumer.index.common.AnalyzerHelper; |
@@ -8,6 +7,8 @@ import com.yoho.search.consumer.service.base.SpiderContentService; | @@ -8,6 +7,8 @@ import com.yoho.search.consumer.service.base.SpiderContentService; | ||
8 | import com.yoho.search.consumer.suggests.common.RetryBusinessFlow; | 7 | import com.yoho.search.consumer.suggests.common.RetryBusinessFlow; |
9 | import com.yoho.search.consumer.suggests.common.SuggestionConstants; | 8 | import com.yoho.search.consumer.suggests.common.SuggestionConstants; |
10 | import com.yoho.search.dal.model.SpiderContent; | 9 | import com.yoho.search.dal.model.SpiderContent; |
10 | +import com.yoho.search.spider.common.BaikeBO; | ||
11 | +import com.yoho.search.spider.common.SpiderBasedHttpRequest; | ||
11 | import org.apache.commons.collections.CollectionUtils; | 12 | import org.apache.commons.collections.CollectionUtils; |
12 | import org.apache.commons.lang.StringUtils; | 13 | import org.apache.commons.lang.StringUtils; |
13 | import org.slf4j.Logger; | 14 | import org.slf4j.Logger; |
@@ -30,7 +31,7 @@ import java.util.stream.Collectors; | @@ -30,7 +31,7 @@ import java.util.stream.Collectors; | ||
30 | * Created by ginozhang on 2017/3/1. | 31 | * Created by ginozhang on 2017/3/1. |
31 | */ | 32 | */ |
32 | @Component | 33 | @Component |
33 | -public class KeywordCrawlerFlow implements RetryBusinessFlow { | 34 | +public class IncrementCrawlerFlow implements RetryBusinessFlow { |
34 | 35 | ||
35 | private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR"); | 36 | private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR"); |
36 | 37 | ||
@@ -39,9 +40,6 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | @@ -39,9 +40,6 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | ||
39 | @Resource(name = "yhNoSyncZSetOperations") | 40 | @Resource(name = "yhNoSyncZSetOperations") |
40 | private YHZSetOperations<String, String> yhNoSyncZSetOperations; | 41 | private YHZSetOperations<String, String> yhNoSyncZSetOperations; |
41 | 42 | ||
42 | - @Resource(name = "yhNoSyncSetOperations") | ||
43 | - private YHSetOperations<String, String> yhNoSyncSetOperations; | ||
44 | - | ||
45 | @Autowired | 43 | @Autowired |
46 | private SpiderContentService spiderContentService; | 44 | private SpiderContentService spiderContentService; |
47 | 45 | ||
@@ -49,7 +47,10 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | @@ -49,7 +47,10 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | ||
49 | private AnalyzerHelper analyzerHelper; | 47 | private AnalyzerHelper analyzerHelper; |
50 | 48 | ||
51 | @Autowired | 49 | @Autowired |
52 | - private HttpRequestService httpRequestService; | 50 | + private SpiderBasedHttpRequest spiderBasedHttpRequest; |
51 | + | ||
52 | + @Autowired | ||
53 | + private BlackKeywordsMgr blackKeywordsMgr; | ||
53 | 54 | ||
54 | private List<String> validKeywordList = null; | 55 | private List<String> validKeywordList = null; |
55 | 56 | ||
@@ -71,24 +72,23 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | @@ -71,24 +72,23 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | ||
71 | keywordSet.add(typedTuple.getValue()); | 72 | keywordSet.add(typedTuple.getValue()); |
72 | } | 73 | } |
73 | 74 | ||
74 | - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSize={}]", keywordSet.size()); | 75 | + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSize={}]", keywordSet.size()); |
75 | if (keywordSet.isEmpty()) { | 76 | if (keywordSet.isEmpty()) { |
76 | return; | 77 | return; |
77 | } | 78 | } |
78 | 79 | ||
79 | Set<String> existSubjects = spiderContentService.getAllSubjects(); | 80 | Set<String> existSubjects = spiderContentService.getAllSubjects(); |
80 | - keywordSet.removeAll(existSubjects); | ||
81 | - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size()); | 81 | + keywordSet = keywordSet.stream().filter(keyword -> !existSubjects.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet()); |
82 | + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size()); | ||
82 | 83 | ||
83 | - Set<String> invalidKeywords = yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID); | 84 | + Set<String> invalidKeywords = blackKeywordsMgr.getBlackKeywords(); |
84 | if (CollectionUtils.isNotEmpty(invalidKeywords)) { | 85 | if (CollectionUtils.isNotEmpty(invalidKeywords)) { |
85 | keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet()); | 86 | keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet()); |
86 | } | 87 | } |
87 | 88 | ||
88 | - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size()); | ||
89 | - | 89 | + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size()); |
90 | this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList()); | 90 | this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList()); |
91 | - logger.info("[func=KeywordCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0); | 91 | + logger.info("[func=IncrementCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0); |
92 | } | 92 | } |
93 | 93 | ||
94 | private boolean validKeyword(String keyword) { | 94 | private boolean validKeyword(String keyword) { |
@@ -141,25 +141,21 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | @@ -141,25 +141,21 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | ||
141 | } | 141 | } |
142 | } | 142 | } |
143 | 143 | ||
144 | - logger.info("[func=KeywordCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords); | 144 | + logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords); |
145 | if (CollectionUtils.isNotEmpty(baikeBOList)) { | 145 | if (CollectionUtils.isNotEmpty(baikeBOList)) { |
146 | List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList()); | 146 | List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList()); |
147 | spiderContentService.insertBatch(spiderContentList); | 147 | spiderContentService.insertBatch(spiderContentList); |
148 | } | 148 | } |
149 | 149 | ||
150 | - if (!failedKeywords.isEmpty()) { | ||
151 | - String[] keywordArray = new String[failedKeywords.size()]; | ||
152 | - yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray)); | ||
153 | - } | ||
154 | - | 150 | + blackKeywordsMgr.addBlackKeywords(failedKeywords); |
155 | return true; | 151 | return true; |
156 | } | 152 | } |
157 | 153 | ||
158 | private BaikeBO crawle(String keyword) { | 154 | private BaikeBO crawle(String keyword) { |
159 | try { | 155 | try { |
160 | String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); | 156 | String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); |
161 | - logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url); | ||
162 | - return httpRequestService.get(url); | 157 | + logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url); |
158 | + return spiderBasedHttpRequest.get(url); | ||
163 | } catch (Exception e) { | 159 | } catch (Exception e) { |
164 | logger.error("crawle keyword [" + keyword + "] failed!", e); | 160 | logger.error("crawle keyword [" + keyword + "] failed!", e); |
165 | } | 161 | } |
@@ -170,13 +166,13 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | @@ -170,13 +166,13 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { | ||
170 | @Override | 166 | @Override |
171 | public void finish(boolean doBusinessResult, Exception exception) { | 167 | public void finish(boolean doBusinessResult, Exception exception) { |
172 | this.validKeywordList = null; | 168 | this.validKeywordList = null; |
173 | - logger.info("[func=KeywordCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception); | 169 | + logger.info("[func=IncrementCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception); |
174 | } | 170 | } |
175 | 171 | ||
176 | public static void main(String[] args) throws UnsupportedEncodingException { | 172 | public static void main(String[] args) throws UnsupportedEncodingException { |
177 | final String keyword = "华伦天奴"; | 173 | final String keyword = "华伦天奴"; |
178 | String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); | 174 | String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); |
179 | - logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url); | ||
180 | - System.out.println((new HttpRequestService()).get(url)); | 175 | + logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url); |
176 | + System.out.println((new SpiderBasedHttpRequest()).get(url)); | ||
181 | } | 177 | } |
182 | } | 178 | } |
@@ -3,7 +3,7 @@ package com.yoho.search.spider.job; | @@ -3,7 +3,7 @@ package com.yoho.search.spider.job; | ||
3 | import com.yoho.search.consumer.common.DynamicConfigService; | 3 | import com.yoho.search.consumer.common.DynamicConfigService; |
4 | import com.yoho.search.consumer.suggests.common.RetryBusinessFlowExecutor; | 4 | import com.yoho.search.consumer.suggests.common.RetryBusinessFlowExecutor; |
5 | import com.yoho.search.spider.conversation.SuggestConvertorFlow; | 5 | import com.yoho.search.spider.conversation.SuggestConvertorFlow; |
6 | -import com.yoho.search.spider.service.KeywordCrawlerFlow; | 6 | +import com.yoho.search.spider.increment.IncrementCrawlerFlow; |
7 | import org.slf4j.Logger; | 7 | import org.slf4j.Logger; |
8 | import org.slf4j.LoggerFactory; | 8 | import org.slf4j.LoggerFactory; |
9 | import org.springframework.beans.factory.annotation.Autowired; | 9 | import org.springframework.beans.factory.annotation.Autowired; |
@@ -19,7 +19,7 @@ public class SpiderJob { | @@ -19,7 +19,7 @@ public class SpiderJob { | ||
19 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); | 19 | private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); |
20 | 20 | ||
21 | @Autowired | 21 | @Autowired |
22 | - private KeywordCrawlerFlow keywordCrawlerFlow; | 22 | + private IncrementCrawlerFlow incrementCrawlerFlow; |
23 | 23 | ||
24 | @Autowired | 24 | @Autowired |
25 | private SuggestConvertorFlow suggestConvertorFlow; | 25 | private SuggestConvertorFlow suggestConvertorFlow; |
@@ -37,7 +37,7 @@ public class SpiderJob { | @@ -37,7 +37,7 @@ public class SpiderJob { | ||
37 | return; | 37 | return; |
38 | } | 38 | } |
39 | 39 | ||
40 | - RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(keywordCrawlerFlow); | 40 | + RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(incrementCrawlerFlow); |
41 | boolean result = flowExecutor.execute(); | 41 | boolean result = flowExecutor.execute(); |
42 | LOGGER.info("[func=crawleEmptySearchKeywords.end][result={}][cost={}]", result, System.currentTimeMillis() - begin); | 42 | LOGGER.info("[func=crawleEmptySearchKeywords.end][result={}][cost={}]", result, System.currentTimeMillis() - begin); |
43 | } | 43 | } |
-
Please register or login to post a comment