...
|
...
|
@@ -35,6 +35,8 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR");
|
|
|
|
|
|
private static final Logger REPORT_LOGGER = LoggerFactory.getLogger("CONSUMER_REPORTER");
|
|
|
|
|
|
private static final int KEYWORD_COUNT = 100;
|
|
|
|
|
|
@Resource(name = "yhNoSyncZSetOperations")
|
...
|
...
|
@@ -54,6 +56,10 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
|
|
|
private List<String> validKeywordList = null;
|
|
|
|
|
|
private volatile List<String> succeedKeywords = new ArrayList<>(100);
|
|
|
|
|
|
private volatile List<String> failedKeywords = new ArrayList<>(100);
|
|
|
|
|
|
@Override
|
|
|
public String flowName() {
|
|
|
return this.getClass().getSimpleName();
|
...
|
...
|
@@ -61,17 +67,26 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
|
|
|
@Override
|
|
|
public void init() {
|
|
|
Set<String> keywordSet = new HashSet<>(1000);
|
|
|
succeedKeywords.clear();
|
|
|
failedKeywords.clear();
|
|
|
|
|
|
Set<String> keywordSet = new HashSet<>(200);
|
|
|
Set<String> topEmptySeachKeywords = new HashSet<>(100);
|
|
|
Set<String> topLessSeachKeywords = new HashSet<>(100);
|
|
|
Set<ZSetOperations.TypedTuple<String>> redisResults = yhNoSyncZSetOperations.reverseRangeWithScores(RedisKeys.getRedisKey4Yesterday(RedisKeys.YOHO_SEARCH_KEYWORDS_EMPTY), 0, KEYWORD_COUNT);
|
|
|
for (ZSetOperations.TypedTuple<String> typedTuple : redisResults) {
|
|
|
keywordSet.add(typedTuple.getValue());
|
|
|
topEmptySeachKeywords.add(typedTuple.getValue());
|
|
|
}
|
|
|
|
|
|
redisResults = yhNoSyncZSetOperations.reverseRangeWithScores(RedisKeys.getRedisKey4Yesterday(RedisKeys.YOHO_SEARCH_KEYWORDS_LESS), 0, KEYWORD_COUNT);
|
|
|
for (ZSetOperations.TypedTuple<String> typedTuple : redisResults) {
|
|
|
keywordSet.add(typedTuple.getValue());
|
|
|
topLessSeachKeywords.add(typedTuple.getValue());
|
|
|
}
|
|
|
|
|
|
REPORT_LOGGER.info("[key=TopEmptySeachKeywords][topEmptySeachKeywords={}]", topEmptySeachKeywords);
|
|
|
REPORT_LOGGER.info("[key=TopLessSeachKeywords][topLessSeachKeywords={}]", topLessSeachKeywords);
|
|
|
keywordSet.addAll(topEmptySeachKeywords);
|
|
|
keywordSet.addAll(topLessSeachKeywords);
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
|
|
|
if (keywordSet.isEmpty()) {
|
|
|
return;
|
...
|
...
|
@@ -89,6 +104,7 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
|
|
|
this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList());
|
|
|
logger.info("[func=IncrementCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
|
|
|
REPORT_LOGGER.info("[key=ValidKeywordList][validIncrementKeywords={}]", validKeywordList);
|
|
|
}
|
|
|
|
|
|
private boolean validKeyword(String keyword) {
|
...
|
...
|
@@ -129,25 +145,41 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
return true;
|
|
|
}
|
|
|
|
|
|
List<String> failedKeywords = new ArrayList<>();
|
|
|
List<String> tempFailedKeywords = new ArrayList<>();
|
|
|
List<String> tempSucceedKeywords = new ArrayList<>();
|
|
|
|
|
|
List<BaikeBO> baikeBOList = new ArrayList<>();
|
|
|
BaikeBO tempBaikeBO;
|
|
|
for (String keyword : subListKeywords) {
|
|
|
if ((tempBaikeBO = incrementCrawlerService.doCrawle(keyword)) != null) {
|
|
|
tempBaikeBO.setTitle(keyword);
|
|
|
baikeBOList.add(tempBaikeBO);
|
|
|
tempSucceedKeywords.add(keyword.toLowerCase().trim());
|
|
|
} else {
|
|
|
failedKeywords.add(keyword.toLowerCase().trim());
|
|
|
tempFailedKeywords.add(keyword.toLowerCase().trim());
|
|
|
}
|
|
|
}
|
|
|
|
|
|
logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
|
|
|
logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), tempFailedKeywords);
|
|
|
if (CollectionUtils.isNotEmpty(baikeBOList)) {
|
|
|
List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList());
|
|
|
spiderContentService.insertBatch(spiderContentList);
|
|
|
}
|
|
|
|
|
|
blackKeywordsMgr.addBlackKeywords(failedKeywords);
|
|
|
blackKeywordsMgr.addBlackKeywords(tempFailedKeywords);
|
|
|
|
|
|
// 用于输出统计日志
|
|
|
if (!tempSucceedKeywords.isEmpty()) {
|
|
|
synchronized (this) {
|
|
|
this.succeedKeywords.addAll(tempSucceedKeywords);
|
|
|
}
|
|
|
}
|
|
|
if (!tempFailedKeywords.isEmpty()) {
|
|
|
synchronized (this) {
|
|
|
this.failedKeywords.addAll(tempFailedKeywords);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
return true;
|
|
|
}
|
|
|
|
...
|
...
|
@@ -155,6 +187,11 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { |
|
|
/**
 * RetryBusinessFlow lifecycle hook invoked once the flow run ends, whether the
 * business step succeeded or failed.
 *
 * Responsibilities visible in this block:
 *  - drop the reference to the keyword working set so it can be garbage-collected
 *    between runs;
 *  - log the overall result (the exception argument may be null on success — it is
 *    passed as the throwable parameter of Logger.info);
 *  - report the accumulated succeeded/failed keyword lists to the reporter logger,
 *    then reset both accumulators for the next run.
 *
 * NOTE(review): this view is a diff rendering; the bare '|' lines below are
 * extraction artifacts of the diff tooling, not Java source.
 */
public void finish(boolean doBusinessResult, Exception exception) {
|
|
|
// Release the keyword list built in init(); a fresh one is created on the next run.
this.validKeywordList = null;
|
|
|
// Result summary; exception is the throwable slot, so a stack trace is emitted when non-null.
logger.info("[func=IncrementCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
|
|
|
|
|
|
// Report the keywords collected by doBusiness during this run (fields declared on the class).
REPORT_LOGGER.info("[key=SucceedIncrementKeywords][succeedIncrementKeywords={}]", succeedKeywords);
|
|
|
REPORT_LOGGER.info("[key=FailedIncrementKeywords][failedIncrementKeywords={}]", failedKeywords);
|
|
|
// Reset accumulators so counts do not leak into the next flow execution.
succeedKeywords.clear();
|
|
|
failedKeywords.clear();
|
|
|
}
|
|
|
|
|
|
public static void main(String[] args) throws UnsupportedEncodingException {
|
...
|
...
|
|