Authored by Gino Zhang

优化代码结构

@@ -18,6 +18,11 @@ public class SuggestionConstants {
18 18
19 public static final List<String> IGNORE_KEYWORDS = Arrays.asList("其他","正常","中文","中国","普通","2%"); 19 public static final List<String> IGNORE_KEYWORDS = Arrays.asList("其他","正常","中文","中国","普通","2%");
20 20
  21 + /**
  22 + * 返回智能搜索词的数量
  23 + */
  24 + public static final int SMART_SUGGESTION_TERM_COUNT = 5;
  25 +
21 public static String YOHO_SEARCH_KEYWORDS_HOT = "YOHO.SEARCH.KEYWORDS.HOT"; 26 public static String YOHO_SEARCH_KEYWORDS_HOT = "YOHO.SEARCH.KEYWORDS.HOT";
22 27
23 28
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.common;
2 2
3 import com.yoho.search.base.utils.DateUtil; 3 import com.yoho.search.base.utils.DateUtil;
4 import com.yoho.search.dal.model.SpiderContent; 4 import com.yoho.search.dal.model.SpiderContent;
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.common;
2 2
3 import org.htmlcleaner.HtmlCleaner; 3 import org.htmlcleaner.HtmlCleaner;
4 import org.htmlcleaner.TagNode; 4 import org.htmlcleaner.TagNode;
@@ -16,7 +16,7 @@ import java.net.URLConnection;
16 * Created by ginozhang on 2017/2/28. 16 * Created by ginozhang on 2017/2/28.
17 */ 17 */
18 @Component 18 @Component
19 -public class HttpRequestService { 19 +public class SpiderBasedHttpRequest {
20 20
21 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); 21 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
22 22
@@ -120,7 +120,7 @@ public class HttpRequestService {
120 } 120 }
121 121
122 public static void main(String[] args) throws XPatherException { 122 public static void main(String[] args) throws XPatherException {
123 - HttpRequestService service = new HttpRequestService(); 123 + SpiderBasedHttpRequest service = new SpiderBasedHttpRequest();
124 String url = "http://baike.baidu.com/item/vans"; 124 String url = "http://baike.baidu.com/item/vans";
125 BaikeBO baikeBO = service.get(url); 125 BaikeBO baikeBO = service.get(url);
126 System.out.println(baikeBO); 126 System.out.println(baikeBO);
@@ -2,7 +2,8 @@ package com.yoho.search.spider.controller;
2 2
3 import com.yoho.search.spider.conversation.SuggestConvertorService; 3 import com.yoho.search.spider.conversation.SuggestConvertorService;
4 import com.yoho.search.spider.job.SpiderJob; 4 import com.yoho.search.spider.job.SpiderJob;
5 -import com.yoho.search.spider.service.BaikeSpiderService; 5 +import com.yoho.search.spider.full.BaikeURLSpiderService;
  6 +import com.yoho.search.spider.increment.BlackKeywordsMgr;
6 import org.springframework.beans.factory.annotation.Autowired; 7 import org.springframework.beans.factory.annotation.Autowired;
7 import org.springframework.stereotype.Controller; 8 import org.springframework.stereotype.Controller;
8 import org.springframework.web.bind.annotation.RequestMapping; 9 import org.springframework.web.bind.annotation.RequestMapping;
@@ -19,7 +20,7 @@ import java.util.Map;
19 public class BaikeSpiderController { 20 public class BaikeSpiderController {
20 21
21 @Autowired 22 @Autowired
22 - private BaikeSpiderService baikeSpiderService; 23 + private BaikeURLSpiderService baikeURLSpiderService;
23 24
24 @Autowired 25 @Autowired
25 private SpiderJob spiderJob; 26 private SpiderJob spiderJob;
@@ -27,11 +28,14 @@ public class BaikeSpiderController {
27 @Autowired 28 @Autowired
28 private SuggestConvertorService suggestConvertorService; 29 private SuggestConvertorService suggestConvertorService;
29 30
  31 + @Autowired
  32 + private BlackKeywordsMgr blackKeywordsMgr;
  33 +
30 @RequestMapping(value = "/spider/baike") 34 @RequestMapping(value = "/spider/baike")
31 @ResponseBody 35 @ResponseBody
32 public Map<String, Object> baikeSpider() { 36 public Map<String, Object> baikeSpider() {
33 try { 37 try {
34 - Map<String, Object> result = baikeSpiderService.start(); 38 + Map<String, Object> result = baikeURLSpiderService.start();
35 result.put("code", 200); 39 result.put("code", 200);
36 result.put("message", "success"); 40 result.put("message", "success");
37 return result; 41 return result;
@@ -94,4 +98,21 @@ public class BaikeSpiderController {
94 return result; 98 return result;
95 } 99 }
96 } 100 }
  101 +
  102 + @RequestMapping(value = "/spider/blackList")
  103 + @ResponseBody
  104 + public Map<String, Object> blackList(@RequestParam String keyword) {
  105 + Map<String, Object> result = new HashMap();
  106 +
  107 + try {
  108 + result.put("data", blackKeywordsMgr.getBlackKeywords());
  109 + result.put("code", 200);
  110 + result.put("message", "success");
  111 + return result;
  112 + } catch (Exception e) {
  113 + result.put("code", 503);
  114 + result.put("message", e.getMessage());
  115 + return result;
  116 + }
  117 + }
97 } 118 }
@@ -25,8 +25,6 @@ public class SuggestConvertorService {
25 25
26 private static final String ANALYZER = "ik_smart"; 26 private static final String ANALYZER = "ik_smart";
27 27
28 - private static final int MAX_KEYWORDS = 5;  
29 -  
30 @Autowired 28 @Autowired
31 private AnalyzerHelper analyzerHelper; 29 private AnalyzerHelper analyzerHelper;
32 30
@@ -66,6 +64,7 @@ public class SuggestConvertorService {
66 } 64 }
67 } 65 }
68 66
  67 + logger.info("[func=buildYohoKeywordBO][yohoKeywordsSize={}]", yohoKeywords.size());
69 return new YohoKeywordsBO(yohoKeywords, filterSortNameSet); 68 return new YohoKeywordsBO(yohoKeywords, filterSortNameSet);
70 } 69 }
71 70
@@ -133,17 +132,19 @@ public class SuggestConvertorService {
133 return null; 132 return null;
134 } 133 }
135 134
136 - logger.trace("[func=SuggestConversionFlow.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult); 135 + logger.trace("[func=SuggestConvertorService.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult);
137 List<String> sortedKeywords = com.yoho.search.base.utils.CollectionUtils.getSortedKeys(matchResult, false); 136 List<String> sortedKeywords = com.yoho.search.base.utils.CollectionUtils.getSortedKeys(matchResult, false);
138 137
139 // 如果可以 总是让品类名在第一个 138 // 如果可以 总是让品类名在第一个
140 String firstSortName = null; 139 String firstSortName = null;
141 for (String keyword : sortedKeywords) { 140 for (String keyword : sortedKeywords) {
142 - firstSortName = keyword;  
143 - break; 141 + if (yohoKeywordsBO.getSortNameMap().contains(keyword)) {
  142 + firstSortName = keyword;
  143 + break;
  144 + }
144 } 145 }
145 146
146 - List<String> resultKeywordList = new ArrayList<>(5); 147 + List<String> resultKeywordList = new ArrayList<>(SuggestionConstants.SMART_SUGGESTION_TERM_COUNT);
147 if (firstSortName != null) { 148 if (firstSortName != null) {
148 resultKeywordList.add(firstSortName); 149 resultKeywordList.add(firstSortName);
149 } 150 }
@@ -151,7 +152,7 @@ public class SuggestConvertorService {
151 for (String keyword : sortedKeywords) { 152 for (String keyword : sortedKeywords) {
152 if (keyword != null && !keyword.equalsIgnoreCase(firstSortName)) { 153 if (keyword != null && !keyword.equalsIgnoreCase(firstSortName)) {
153 resultKeywordList.add(keyword); 154 resultKeywordList.add(keyword);
154 - if (resultKeywordList.size() == MAX_KEYWORDS) { 155 + if (resultKeywordList.size() == SuggestionConstants.SMART_SUGGESTION_TERM_COUNT) {
155 break; 156 break;
156 } 157 }
157 } 158 }
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.full;
2 2
3 import com.yoho.search.base.utils.ApplicationContextUtil; 3 import com.yoho.search.base.utils.ApplicationContextUtil;
4 import com.yoho.search.consumer.service.base.SpiderContentService; 4 import com.yoho.search.consumer.service.base.SpiderContentService;
5 import com.yoho.search.dal.model.SpiderContent; 5 import com.yoho.search.dal.model.SpiderContent;
  6 +import com.yoho.search.spider.common.BaikeBO;
6 import org.slf4j.Logger; 7 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 8 import org.slf4j.LoggerFactory;
8 9
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.full;
2 2
3 import com.yoho.search.base.utils.FileUtils; 3 import com.yoho.search.base.utils.FileUtils;
  4 +import com.yoho.search.spider.common.BaikeBO;
  5 +import com.yoho.search.spider.common.SpiderBasedHttpRequest;
4 import org.apache.commons.collections.CollectionUtils; 6 import org.apache.commons.collections.CollectionUtils;
5 import org.slf4j.Logger; 7 import org.slf4j.Logger;
6 import org.slf4j.LoggerFactory; 8 import org.slf4j.LoggerFactory;
@@ -18,12 +20,12 @@ public class BaikeURLCrawler implements Callable<Integer> {
18 20
19 private final Set<String> existSubjects; 21 private final Set<String> existSubjects;
20 22
21 - private final HttpRequestService httpRequestService; 23 + private final SpiderBasedHttpRequest spiderBasedHttpRequest;
22 24
23 - public BaikeURLCrawler(File file, Set<String> existSubjects, HttpRequestService httpRequestService) { 25 + public BaikeURLCrawler(File file, Set<String> existSubjects, SpiderBasedHttpRequest spiderBasedHttpRequest) {
24 this.file = file; 26 this.file = file;
25 this.existSubjects = existSubjects; 27 this.existSubjects = existSubjects;
26 - this.httpRequestService = httpRequestService; 28 + this.spiderBasedHttpRequest = spiderBasedHttpRequest;
27 } 29 }
28 30
29 @Override 31 @Override
@@ -52,7 +54,7 @@ public class BaikeURLCrawler implements Callable<Integer> {
52 try { 54 try {
53 if (!this.existSubjects.contains(title.toLowerCase().trim())) { 55 if (!this.existSubjects.contains(title.toLowerCase().trim())) {
54 LOGGER.info("[func=BaikeURLCrawler][title={}][url={}]", title, url); 56 LOGGER.info("[func=BaikeURLCrawler][title={}][url={}]", title, url);
55 - BaikeBO baikeBO = httpRequestService.get(url); 57 + BaikeBO baikeBO = spiderBasedHttpRequest.get(url);
56 if (baikeBO != null) { 58 if (baikeBO != null) {
57 baikeBO.setTitle(title); 59 baikeBO.setTitle(title);
58 BaikeBOBulkService.submitBaike(baikeBO); 60 BaikeBOBulkService.submitBaike(baikeBO);
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.full;
2 2
3 -import com.yoho.search.consumer.service.base.*; 3 +import com.yoho.search.consumer.service.base.SpiderContentService;
  4 +import com.yoho.search.spider.common.SpiderBasedHttpRequest;
4 import org.slf4j.Logger; 5 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 6 import org.slf4j.LoggerFactory;
6 import org.springframework.beans.factory.annotation.Autowired; 7 import org.springframework.beans.factory.annotation.Autowired;
@@ -18,7 +19,7 @@ import java.util.concurrent.atomic.AtomicInteger;
18 * Created by ginozhang on 2017/3/1. 19 * Created by ginozhang on 2017/3/1.
19 */ 20 */
20 @Component 21 @Component
21 -public class BaikeSpiderService { 22 +public class BaikeURLSpiderService {
22 23
23 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); 24 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
24 25
@@ -30,14 +31,14 @@ public class BaikeSpiderService {
30 private SpiderContentService spiderContentService; 31 private SpiderContentService spiderContentService;
31 32
32 @Autowired 33 @Autowired
33 - private HttpRequestService httpRequestService; 34 + private SpiderBasedHttpRequest spiderBasedHttpRequest;
34 35
35 public synchronized Map<String, Object> start() { 36 public synchronized Map<String, Object> start() {
36 long begin = System.currentTimeMillis(); 37 long begin = System.currentTimeMillis();
37 LOGGER.info("[func=BaikeSpiderService.start][begin={}]", begin); 38 LOGGER.info("[func=BaikeSpiderService.start][begin={}]", begin);
38 Map<String, Object> result = new HashMap(); 39 Map<String, Object> result = new HashMap();
39 40
40 - String baikeDir = BaikeSpiderService.class.getResource("/baike").getPath(); 41 + String baikeDir = BaikeURLSpiderService.class.getResource("/baike").getPath();
41 File baikeDirFile = new File(baikeDir); 42 File baikeDirFile = new File(baikeDir);
42 if (!baikeDirFile.exists() || !baikeDirFile.isDirectory()) { 43 if (!baikeDirFile.exists() || !baikeDirFile.isDirectory()) {
43 throw new RuntimeException("Cannot find baike directory. baikeDir=" + baikeDir); 44 throw new RuntimeException("Cannot find baike directory. baikeDir=" + baikeDir);
@@ -57,7 +58,7 @@ public class BaikeSpiderService {
57 ExecutorService pool = Executors.newFixedThreadPool(POOL_SIZE, thread -> new Thread(thread, "BaikeSpider-" + atomicInteger.getAndIncrement())); 58 ExecutorService pool = Executors.newFixedThreadPool(POOL_SIZE, thread -> new Thread(thread, "BaikeSpider-" + atomicInteger.getAndIncrement()));
58 List<Future<Integer>> futures = new ArrayList<>(baikeUrlFiles.length); 59 List<Future<Integer>> futures = new ArrayList<>(baikeUrlFiles.length);
59 for (File baikeUrlFile : baikeUrlFiles) { 60 for (File baikeUrlFile : baikeUrlFiles) {
60 - futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, httpRequestService))); 61 + futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, spiderBasedHttpRequest)));
61 } 62 }
62 63
63 for (Future<Integer> future : futures) { 64 for (Future<Integer> future : futures) {
  1 +package com.yoho.search.spider.increment;
  2 +
  3 +import com.yoho.core.redis.YHSetOperations;
  4 +import com.yoho.search.base.utils.RedisKeys;
  5 +import org.apache.commons.collections.CollectionUtils;
  6 +import org.springframework.stereotype.Component;
  7 +
  8 +import javax.annotation.Resource;
  9 +import java.util.List;
  10 +import java.util.Set;
  11 +
  12 +/**
  13 + * Created by ginozhang on 2017/3/3.
  14 + * 爬虫黑名单管理,超过指定次数失败的加入该黑名单。
  15 + */
  16 +@Component
  17 +public class BlackKeywordsMgr {
  18 +
  19 + @Resource(name = "yhNoSyncSetOperations")
  20 + private YHSetOperations<String, String> yhNoSyncSetOperations;
  21 +
  22 + public Set<String> getBlackKeywords() {
  23 + return yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID);
  24 + }
  25 +
  26 + public void addBlackKeywords(List<String> failedKeywords) {
  27 + if (CollectionUtils.isNotEmpty(failedKeywords)) {
  28 + String[] keywordArray = new String[failedKeywords.size()];
  29 + yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray));
  30 + }
  31 + }
  32 +
  33 +}
1 -package com.yoho.search.spider.service; 1 +package com.yoho.search.spider.increment;
2 2
3 -import com.yoho.core.redis.YHSetOperations;  
4 import com.yoho.core.redis.YHZSetOperations; 3 import com.yoho.core.redis.YHZSetOperations;
5 import com.yoho.search.base.utils.RedisKeys; 4 import com.yoho.search.base.utils.RedisKeys;
6 import com.yoho.search.consumer.index.common.AnalyzerHelper; 5 import com.yoho.search.consumer.index.common.AnalyzerHelper;
@@ -8,6 +7,8 @@ import com.yoho.search.consumer.service.base.SpiderContentService;
8 import com.yoho.search.consumer.suggests.common.RetryBusinessFlow; 7 import com.yoho.search.consumer.suggests.common.RetryBusinessFlow;
9 import com.yoho.search.consumer.suggests.common.SuggestionConstants; 8 import com.yoho.search.consumer.suggests.common.SuggestionConstants;
10 import com.yoho.search.dal.model.SpiderContent; 9 import com.yoho.search.dal.model.SpiderContent;
  10 +import com.yoho.search.spider.common.BaikeBO;
  11 +import com.yoho.search.spider.common.SpiderBasedHttpRequest;
11 import org.apache.commons.collections.CollectionUtils; 12 import org.apache.commons.collections.CollectionUtils;
12 import org.apache.commons.lang.StringUtils; 13 import org.apache.commons.lang.StringUtils;
13 import org.slf4j.Logger; 14 import org.slf4j.Logger;
@@ -30,7 +31,7 @@ import java.util.stream.Collectors;
30 * Created by ginozhang on 2017/3/1. 31 * Created by ginozhang on 2017/3/1.
31 */ 32 */
32 @Component 33 @Component
33 -public class KeywordCrawlerFlow implements RetryBusinessFlow { 34 +public class IncrementCrawlerFlow implements RetryBusinessFlow {
34 35
35 private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR"); 36 private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR");
36 37
@@ -39,9 +40,6 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
39 @Resource(name = "yhNoSyncZSetOperations") 40 @Resource(name = "yhNoSyncZSetOperations")
40 private YHZSetOperations<String, String> yhNoSyncZSetOperations; 41 private YHZSetOperations<String, String> yhNoSyncZSetOperations;
41 42
42 - @Resource(name = "yhNoSyncSetOperations")  
43 - private YHSetOperations<String, String> yhNoSyncSetOperations;  
44 -  
45 @Autowired 43 @Autowired
46 private SpiderContentService spiderContentService; 44 private SpiderContentService spiderContentService;
47 45
@@ -49,7 +47,10 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
49 private AnalyzerHelper analyzerHelper; 47 private AnalyzerHelper analyzerHelper;
50 48
51 @Autowired 49 @Autowired
52 - private HttpRequestService httpRequestService; 50 + private SpiderBasedHttpRequest spiderBasedHttpRequest;
  51 +
  52 + @Autowired
  53 + private BlackKeywordsMgr blackKeywordsMgr;
53 54
54 private List<String> validKeywordList = null; 55 private List<String> validKeywordList = null;
55 56
@@ -71,24 +72,23 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
71 keywordSet.add(typedTuple.getValue()); 72 keywordSet.add(typedTuple.getValue());
72 } 73 }
73 74
74 - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSize={}]", keywordSet.size()); 75 + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
75 if (keywordSet.isEmpty()) { 76 if (keywordSet.isEmpty()) {
76 return; 77 return;
77 } 78 }
78 79
79 Set<String> existSubjects = spiderContentService.getAllSubjects(); 80 Set<String> existSubjects = spiderContentService.getAllSubjects();
80 - keywordSet.removeAll(existSubjects);  
81 - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size()); 81 + keywordSet = keywordSet.stream().filter(keyword -> !existSubjects.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
  82 + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size());
82 83
83 - Set<String> invalidKeywords = yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID); 84 + Set<String> invalidKeywords = blackKeywordsMgr.getBlackKeywords();
84 if (CollectionUtils.isNotEmpty(invalidKeywords)) { 85 if (CollectionUtils.isNotEmpty(invalidKeywords)) {
85 keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet()); 86 keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
86 } 87 }
87 88
88 - logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());  
89 - 89 + logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
90 this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList()); 90 this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList());
91 - logger.info("[func=KeywordCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0); 91 + logger.info("[func=IncrementCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
92 } 92 }
93 93
94 private boolean validKeyword(String keyword) { 94 private boolean validKeyword(String keyword) {
@@ -141,25 +141,21 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
141 } 141 }
142 } 142 }
143 143
144 - logger.info("[func=KeywordCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords); 144 + logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
145 if (CollectionUtils.isNotEmpty(baikeBOList)) { 145 if (CollectionUtils.isNotEmpty(baikeBOList)) {
146 List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList()); 146 List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList());
147 spiderContentService.insertBatch(spiderContentList); 147 spiderContentService.insertBatch(spiderContentList);
148 } 148 }
149 149
150 - if (!failedKeywords.isEmpty()) {  
151 - String[] keywordArray = new String[failedKeywords.size()];  
152 - yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray));  
153 - }  
154 - 150 + blackKeywordsMgr.addBlackKeywords(failedKeywords);
155 return true; 151 return true;
156 } 152 }
157 153
158 private BaikeBO crawle(String keyword) { 154 private BaikeBO crawle(String keyword) {
159 try { 155 try {
160 String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); 156 String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
161 - logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);  
162 - return httpRequestService.get(url); 157 + logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
  158 + return spiderBasedHttpRequest.get(url);
163 } catch (Exception e) { 159 } catch (Exception e) {
164 logger.error("crawle keyword [" + keyword + "] failed!", e); 160 logger.error("crawle keyword [" + keyword + "] failed!", e);
165 } 161 }
@@ -170,13 +166,13 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
170 @Override 166 @Override
171 public void finish(boolean doBusinessResult, Exception exception) { 167 public void finish(boolean doBusinessResult, Exception exception) {
172 this.validKeywordList = null; 168 this.validKeywordList = null;
173 - logger.info("[func=KeywordCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception); 169 + logger.info("[func=IncrementCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
174 } 170 }
175 171
176 public static void main(String[] args) throws UnsupportedEncodingException { 172 public static void main(String[] args) throws UnsupportedEncodingException {
177 final String keyword = "华伦天奴"; 173 final String keyword = "华伦天奴";
178 String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8"); 174 String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
179 - logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);  
180 - System.out.println((new HttpRequestService()).get(url)); 175 + logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
  176 + System.out.println((new SpiderBasedHttpRequest()).get(url));
181 } 177 }
182 } 178 }
@@ -3,7 +3,7 @@ package com.yoho.search.spider.job;
3 import com.yoho.search.consumer.common.DynamicConfigService; 3 import com.yoho.search.consumer.common.DynamicConfigService;
4 import com.yoho.search.consumer.suggests.common.RetryBusinessFlowExecutor; 4 import com.yoho.search.consumer.suggests.common.RetryBusinessFlowExecutor;
5 import com.yoho.search.spider.conversation.SuggestConvertorFlow; 5 import com.yoho.search.spider.conversation.SuggestConvertorFlow;
6 -import com.yoho.search.spider.service.KeywordCrawlerFlow; 6 +import com.yoho.search.spider.increment.IncrementCrawlerFlow;
7 import org.slf4j.Logger; 7 import org.slf4j.Logger;
8 import org.slf4j.LoggerFactory; 8 import org.slf4j.LoggerFactory;
9 import org.springframework.beans.factory.annotation.Autowired; 9 import org.springframework.beans.factory.annotation.Autowired;
@@ -19,7 +19,7 @@ public class SpiderJob {
19 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER"); 19 private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
20 20
21 @Autowired 21 @Autowired
22 - private KeywordCrawlerFlow keywordCrawlerFlow; 22 + private IncrementCrawlerFlow incrementCrawlerFlow;
23 23
24 @Autowired 24 @Autowired
25 private SuggestConvertorFlow suggestConvertorFlow; 25 private SuggestConvertorFlow suggestConvertorFlow;
@@ -37,7 +37,7 @@ public class SpiderJob {
37 return; 37 return;
38 } 38 }
39 39
40 - RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(keywordCrawlerFlow); 40 + RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(incrementCrawlerFlow);
41 boolean result = flowExecutor.execute(); 41 boolean result = flowExecutor.execute();
42 LOGGER.info("[func=crawleEmptySearchKeywords.end][result={}][cost={}]", result, System.currentTimeMillis() - begin); 42 LOGGER.info("[func=crawleEmptySearchKeywords.end][result={}][cost={}]", result, System.currentTimeMillis() - begin);
43 } 43 }