|
|
package com.yoho.search.spider.service;
|
|
|
package com.yoho.search.spider.increment;
|
|
|
|
|
|
import com.yoho.core.redis.YHSetOperations;
|
|
|
import com.yoho.core.redis.YHZSetOperations;
|
|
|
import com.yoho.search.base.utils.RedisKeys;
|
|
|
import com.yoho.search.consumer.index.common.AnalyzerHelper;
|
...
|
...
|
@@ -8,6 +7,8 @@ import com.yoho.search.consumer.service.base.SpiderContentService; |
|
|
import com.yoho.search.consumer.suggests.common.RetryBusinessFlow;
|
|
|
import com.yoho.search.consumer.suggests.common.SuggestionConstants;
|
|
|
import com.yoho.search.dal.model.SpiderContent;
|
|
|
import com.yoho.search.spider.common.BaikeBO;
|
|
|
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
|
|
|
import org.apache.commons.collections.CollectionUtils;
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
import org.slf4j.Logger;
|
...
|
...
|
@@ -30,7 +31,7 @@ import java.util.stream.Collectors; |
|
|
* Created by ginozhang on 2017/3/1.
|
|
|
*/
|
|
|
@Component
|
|
|
public class KeywordCrawlerFlow implements RetryBusinessFlow {
|
|
|
public class IncrementCrawlerFlow implements RetryBusinessFlow {
|
|
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR");
|
|
|
|
...
|
...
|
@@ -39,9 +40,6 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { |
|
|
@Resource(name = "yhNoSyncZSetOperations")
|
|
|
private YHZSetOperations<String, String> yhNoSyncZSetOperations;
|
|
|
|
|
|
@Resource(name = "yhNoSyncSetOperations")
|
|
|
private YHSetOperations<String, String> yhNoSyncSetOperations;
|
|
|
|
|
|
@Autowired
|
|
|
private SpiderContentService spiderContentService;
|
|
|
|
...
|
...
|
@@ -49,7 +47,10 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { |
|
|
private AnalyzerHelper analyzerHelper;
|
|
|
|
|
|
@Autowired
|
|
|
private HttpRequestService httpRequestService;
|
|
|
private SpiderBasedHttpRequest spiderBasedHttpRequest;
|
|
|
|
|
|
@Autowired
|
|
|
private BlackKeywordsMgr blackKeywordsMgr;
|
|
|
|
|
|
private List<String> validKeywordList = null;
|
|
|
|
...
|
...
|
@@ -71,24 +72,23 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { |
|
|
keywordSet.add(typedTuple.getValue());
|
|
|
}
|
|
|
|
|
|
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
|
|
|
if (keywordSet.isEmpty()) {
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
Set<String> existSubjects = spiderContentService.getAllSubjects();
|
|
|
keywordSet.removeAll(existSubjects);
|
|
|
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size());
|
|
|
keywordSet = keywordSet.stream().filter(keyword -> !existSubjects.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size());
|
|
|
|
|
|
Set<String> invalidKeywords = yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID);
|
|
|
Set<String> invalidKeywords = blackKeywordsMgr.getBlackKeywords();
|
|
|
if (CollectionUtils.isNotEmpty(invalidKeywords)) {
|
|
|
keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
|
|
|
}
|
|
|
|
|
|
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
|
|
|
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
|
|
|
this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList());
|
|
|
logger.info("[func=KeywordCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
|
|
|
}
|
|
|
|
|
|
private boolean validKeyword(String keyword) {
|
...
|
...
|
@@ -141,25 +141,21 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { |
|
|
}
|
|
|
}
|
|
|
|
|
|
logger.info("[func=KeywordCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
|
|
|
logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
|
|
|
if (CollectionUtils.isNotEmpty(baikeBOList)) {
|
|
|
List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList());
|
|
|
spiderContentService.insertBatch(spiderContentList);
|
|
|
}
|
|
|
|
|
|
if (!failedKeywords.isEmpty()) {
|
|
|
String[] keywordArray = new String[failedKeywords.size()];
|
|
|
yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray));
|
|
|
}
|
|
|
|
|
|
blackKeywordsMgr.addBlackKeywords(failedKeywords);
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
private BaikeBO crawle(String keyword) {
|
|
|
try {
|
|
|
String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
|
|
|
logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);
|
|
|
return httpRequestService.get(url);
|
|
|
logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
|
|
|
return spiderBasedHttpRequest.get(url);
|
|
|
} catch (Exception e) {
|
|
|
logger.error("crawle keyword [" + keyword + "] failed!", e);
|
|
|
}
|
...
|
...
|
@@ -170,13 +166,13 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow { |
|
|
@Override
|
|
|
public void finish(boolean doBusinessResult, Exception exception) {
|
|
|
this.validKeywordList = null;
|
|
|
logger.info("[func=KeywordCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
|
|
|
logger.info("[func=IncrementCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) throws UnsupportedEncodingException {
|
|
|
final String keyword = "华伦天奴";
|
|
|
String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
|
|
|
logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);
|
|
|
System.out.println((new HttpRequestService()).get(url));
|
|
|
logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
|
|
|
System.out.println((new SpiderBasedHttpRequest()).get(url));
|
|
|
}
|
|
|
} |
...
|
...
|
|