Authored by Gino Zhang

优化代码结构

... ... @@ -18,6 +18,11 @@ public class SuggestionConstants {
public static final List<String> IGNORE_KEYWORDS = Arrays.asList("其他","正常","中文","中国","普通","2%");
/**
* 返回智能搜索词的数量
*/
public static final int SMART_SUGGESTION_TERM_COUNT = 5;
public static String YOHO_SEARCH_KEYWORDS_HOT = "YOHO.SEARCH.KEYWORDS.HOT";
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.common;
import com.yoho.search.base.utils.DateUtil;
import com.yoho.search.dal.model.SpiderContent;
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.common;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
... ... @@ -16,7 +16,7 @@ import java.net.URLConnection;
* Created by ginozhang on 2017/2/28.
*/
@Component
public class HttpRequestService {
public class SpiderBasedHttpRequest {
private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
... ... @@ -120,7 +120,7 @@ public class HttpRequestService {
}
public static void main(String[] args) throws XPatherException {
HttpRequestService service = new HttpRequestService();
SpiderBasedHttpRequest service = new SpiderBasedHttpRequest();
String url = "http://baike.baidu.com/item/vans";
BaikeBO baikeBO = service.get(url);
System.out.println(baikeBO);
... ...
... ... @@ -2,7 +2,8 @@ package com.yoho.search.spider.controller;
import com.yoho.search.spider.conversation.SuggestConvertorService;
import com.yoho.search.spider.job.SpiderJob;
import com.yoho.search.spider.service.BaikeSpiderService;
import com.yoho.search.spider.full.BaikeURLSpiderService;
import com.yoho.search.spider.increment.BlackKeywordsMgr;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
... ... @@ -19,7 +20,7 @@ import java.util.Map;
public class BaikeSpiderController {
@Autowired
private BaikeSpiderService baikeSpiderService;
private BaikeURLSpiderService baikeURLSpiderService;
@Autowired
private SpiderJob spiderJob;
... ... @@ -27,11 +28,14 @@ public class BaikeSpiderController {
@Autowired
private SuggestConvertorService suggestConvertorService;
@Autowired
private BlackKeywordsMgr blackKeywordsMgr;
@RequestMapping(value = "/spider/baike")
@ResponseBody
public Map<String, Object> baikeSpider() {
try {
Map<String, Object> result = baikeSpiderService.start();
Map<String, Object> result = baikeURLSpiderService.start();
result.put("code", 200);
result.put("message", "success");
return result;
... ... @@ -94,4 +98,21 @@ public class BaikeSpiderController {
return result;
}
}
@RequestMapping(value = "/spider/blackList")
@ResponseBody
public Map<String, Object> blackList(@RequestParam String keyword) {
    // Returns the crawler keyword blacklist (keywords that repeatedly failed to crawl).
    // NOTE(review): the 'keyword' request parameter is not used by this handler; it is
    // kept so the HTTP contract (a required request parameter) stays unchanged — confirm intent.
    Map<String, Object> result = new HashMap<>(); // was raw 'new HashMap()'; use the typed diamond form
    try {
        result.put("data", blackKeywordsMgr.getBlackKeywords());
        result.put("code", 200);
        result.put("message", "success");
        return result;
    } catch (Exception e) {
        // Report the failure in the response body rather than propagating; 503 mirrors
        // the error convention used by the other handlers in this controller.
        result.put("code", 503);
        result.put("message", e.getMessage());
        return result;
    }
}
}
... ...
... ... @@ -25,8 +25,6 @@ public class SuggestConvertorService {
private static final String ANALYZER = "ik_smart";
private static final int MAX_KEYWORDS = 5;
@Autowired
private AnalyzerHelper analyzerHelper;
... ... @@ -66,6 +64,7 @@ public class SuggestConvertorService {
}
}
logger.info("[func=buildYohoKeywordBO][yohoKeywordsSize={}]", yohoKeywords.size());
return new YohoKeywordsBO(yohoKeywords, filterSortNameSet);
}
... ... @@ -133,17 +132,19 @@ public class SuggestConvertorService {
return null;
}
logger.trace("[func=SuggestConversionFlow.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult);
logger.trace("[func=SuggestConvertorService.parse][subject={}][matchResult={}]", spiderContent.getSubject(), matchResult);
List<String> sortedKeywords = com.yoho.search.base.utils.CollectionUtils.getSortedKeys(matchResult, false);
// 如果可以 总是让品类名在第一个
String firstSortName = null;
for (String keyword : sortedKeywords) {
if (yohoKeywordsBO.getSortNameMap().contains(keyword)) {
firstSortName = keyword;
break;
}
}
List<String> resultKeywordList = new ArrayList<>(5);
List<String> resultKeywordList = new ArrayList<>(SuggestionConstants.SMART_SUGGESTION_TERM_COUNT);
if (firstSortName != null) {
resultKeywordList.add(firstSortName);
}
... ... @@ -151,7 +152,7 @@ public class SuggestConvertorService {
for (String keyword : sortedKeywords) {
if (keyword != null && !keyword.equalsIgnoreCase(firstSortName)) {
resultKeywordList.add(keyword);
if (resultKeywordList.size() == MAX_KEYWORDS) {
if (resultKeywordList.size() == SuggestionConstants.SMART_SUGGESTION_TERM_COUNT) {
break;
}
}
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.full;
import com.yoho.search.base.utils.ApplicationContextUtil;
import com.yoho.search.consumer.service.base.SpiderContentService;
import com.yoho.search.dal.model.SpiderContent;
import com.yoho.search.spider.common.BaikeBO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.full;
import com.yoho.search.base.utils.FileUtils;
import com.yoho.search.spider.common.BaikeBO;
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
... ... @@ -18,12 +20,12 @@ public class BaikeURLCrawler implements Callable<Integer> {
private final Set<String> existSubjects;
private final HttpRequestService httpRequestService;
private final SpiderBasedHttpRequest spiderBasedHttpRequest;
public BaikeURLCrawler(File file, Set<String> existSubjects, HttpRequestService httpRequestService) {
public BaikeURLCrawler(File file, Set<String> existSubjects, SpiderBasedHttpRequest spiderBasedHttpRequest) {
this.file = file;
this.existSubjects = existSubjects;
this.httpRequestService = httpRequestService;
this.spiderBasedHttpRequest = spiderBasedHttpRequest;
}
@Override
... ... @@ -52,7 +54,7 @@ public class BaikeURLCrawler implements Callable<Integer> {
try {
if (!this.existSubjects.contains(title.toLowerCase().trim())) {
LOGGER.info("[func=BaikeURLCrawler][title={}][url={}]", title, url);
BaikeBO baikeBO = httpRequestService.get(url);
BaikeBO baikeBO = spiderBasedHttpRequest.get(url);
if (baikeBO != null) {
baikeBO.setTitle(title);
BaikeBOBulkService.submitBaike(baikeBO);
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.full;
import com.yoho.search.consumer.service.base.*;
import com.yoho.search.consumer.service.base.SpiderContentService;
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
... ... @@ -18,7 +19,7 @@ import java.util.concurrent.atomic.AtomicInteger;
* Created by ginozhang on 2017/3/1.
*/
@Component
public class BaikeSpiderService {
public class BaikeURLSpiderService {
private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
... ... @@ -30,14 +31,14 @@ public class BaikeSpiderService {
private SpiderContentService spiderContentService;
@Autowired
private HttpRequestService httpRequestService;
private SpiderBasedHttpRequest spiderBasedHttpRequest;
public synchronized Map<String, Object> start() {
long begin = System.currentTimeMillis();
LOGGER.info("[func=BaikeSpiderService.start][begin={}]", begin);
Map<String, Object> result = new HashMap();
String baikeDir = BaikeSpiderService.class.getResource("/baike").getPath();
String baikeDir = BaikeURLSpiderService.class.getResource("/baike").getPath();
File baikeDirFile = new File(baikeDir);
if (!baikeDirFile.exists() || !baikeDirFile.isDirectory()) {
throw new RuntimeException("Cannot find baike directory. baikeDir=" + baikeDir);
... ... @@ -57,7 +58,7 @@ public class BaikeSpiderService {
ExecutorService pool = Executors.newFixedThreadPool(POOL_SIZE, thread -> new Thread(thread, "BaikeSpider-" + atomicInteger.getAndIncrement()));
List<Future<Integer>> futures = new ArrayList<>(baikeUrlFiles.length);
for (File baikeUrlFile : baikeUrlFiles) {
futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, httpRequestService)));
futures.add(pool.submit(new BaikeURLCrawler(baikeUrlFile, existSubjects, spiderBasedHttpRequest)));
}
for (Future<Integer> future : futures) {
... ...
package com.yoho.search.spider.increment;
import com.yoho.core.redis.YHSetOperations;
import com.yoho.search.base.utils.RedisKeys;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.stereotype.Component;
import javax.annotation.Resource;
import java.util.List;
import java.util.Set;
/**
* Created by ginozhang on 2017/3/3.
* 爬虫黑名单管理,超过指定次数失败的加入该黑名单。
*/
@Component
public class BlackKeywordsMgr {

    @Resource(name = "yhNoSyncSetOperations")
    private YHSetOperations<String, String> yhNoSyncSetOperations;

    /**
     * Returns the current crawler blacklist: keywords whose crawling failed more
     * than the allowed number of times.
     *
     * @return the set of blacklisted keywords stored under the redis key
     *         {@code RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID}
     */
    public Set<String> getBlackKeywords() {
        return yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID);
    }

    /**
     * Adds the given failed keywords to the redis-backed blacklist.
     * A {@code null} or empty list is a no-op.
     *
     * @param failedKeywords keywords that failed to be crawled
     */
    public void addBlackKeywords(List<String> failedKeywords) {
        if (CollectionUtils.isNotEmpty(failedKeywords)) {
            // toArray(new String[0]) is the idiomatic form; it avoids the manual
            // pre-sized array and is at least as fast on modern JVMs.
            yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(new String[0]));
        }
    }
}
... ...
package com.yoho.search.spider.service;
package com.yoho.search.spider.increment;
import com.yoho.core.redis.YHSetOperations;
import com.yoho.core.redis.YHZSetOperations;
import com.yoho.search.base.utils.RedisKeys;
import com.yoho.search.consumer.index.common.AnalyzerHelper;
... ... @@ -8,6 +7,8 @@ import com.yoho.search.consumer.service.base.SpiderContentService;
import com.yoho.search.consumer.suggests.common.RetryBusinessFlow;
import com.yoho.search.consumer.suggests.common.SuggestionConstants;
import com.yoho.search.dal.model.SpiderContent;
import com.yoho.search.spider.common.BaikeBO;
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
... ... @@ -30,7 +31,7 @@ import java.util.stream.Collectors;
* Created by ginozhang on 2017/3/1.
*/
@Component
public class KeywordCrawlerFlow implements RetryBusinessFlow {
public class IncrementCrawlerFlow implements RetryBusinessFlow {
private static final Logger logger = LoggerFactory.getLogger("FLOW_EXECUTOR");
... ... @@ -39,9 +40,6 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
@Resource(name = "yhNoSyncZSetOperations")
private YHZSetOperations<String, String> yhNoSyncZSetOperations;
@Resource(name = "yhNoSyncSetOperations")
private YHSetOperations<String, String> yhNoSyncSetOperations;
@Autowired
private SpiderContentService spiderContentService;
... ... @@ -49,7 +47,10 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
private AnalyzerHelper analyzerHelper;
@Autowired
private HttpRequestService httpRequestService;
private SpiderBasedHttpRequest spiderBasedHttpRequest;
@Autowired
private BlackKeywordsMgr blackKeywordsMgr;
private List<String> validKeywordList = null;
... ... @@ -71,24 +72,23 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
keywordSet.add(typedTuple.getValue());
}
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSize={}]", keywordSet.size());
if (keywordSet.isEmpty()) {
return;
}
Set<String> existSubjects = spiderContentService.getAllSubjects();
keywordSet.removeAll(existSubjects);
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size());
keywordSet = keywordSet.stream().filter(keyword -> !existSubjects.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedExist={}]", keywordSet.size());
Set<String> invalidKeywords = yhNoSyncSetOperations.members(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID);
Set<String> invalidKeywords = blackKeywordsMgr.getBlackKeywords();
if (CollectionUtils.isNotEmpty(invalidKeywords)) {
keywordSet = keywordSet.stream().filter(keyword -> !invalidKeywords.contains(keyword.toLowerCase().trim())).collect(Collectors.toSet());
}
logger.info("[func=KeywordCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
logger.info("[func=IncrementCrawlerFlow.init][keywordSetSizeRemovedInvalid={}]", keywordSet.size());
this.validKeywordList = keywordSet.parallelStream().filter(keyword -> validKeyword(keyword)).collect(Collectors.toList());
logger.info("[func=KeywordCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
logger.info("[func=IncrementCrawlerFlow.init][validKeywordListSize={}]", validKeywordList != null ? validKeywordList.size() : 0);
}
private boolean validKeyword(String keyword) {
... ... @@ -141,25 +141,21 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
}
}
logger.info("[func=KeywordCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
logger.info("[func=IncrementCrawlerFlow.doBusiness][baikeBOListSize={}][failedKeywords={}]", baikeBOList.size(), failedKeywords);
if (CollectionUtils.isNotEmpty(baikeBOList)) {
List<SpiderContent> spiderContentList = baikeBOList.stream().map(baikeBO -> baikeBO.toSpiderContent()).collect(Collectors.toList());
spiderContentService.insertBatch(spiderContentList);
}
if (!failedKeywords.isEmpty()) {
String[] keywordArray = new String[failedKeywords.size()];
yhNoSyncSetOperations.add(RedisKeys.YOHO_SEARCH_KEYWORDS_INVALID, failedKeywords.toArray(keywordArray));
}
blackKeywordsMgr.addBlackKeywords(failedKeywords);
return true;
}
private BaikeBO crawle(String keyword) {
try {
String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);
return httpRequestService.get(url);
logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
return spiderBasedHttpRequest.get(url);
} catch (Exception e) {
logger.error("crawle keyword [" + keyword + "] failed!", e);
}
... ... @@ -170,13 +166,13 @@ public class KeywordCrawlerFlow implements RetryBusinessFlow {
@Override
public void finish(boolean doBusinessResult, Exception exception) {
this.validKeywordList = null;
logger.info("[func=KeywordCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
logger.info("[func=IncrementCrawlerFlow.finish][doBusinessResult=" + doBusinessResult + "]", exception);
}
public static void main(String[] args) throws UnsupportedEncodingException {
final String keyword = "华伦天奴";
String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
logger.info("[func=KeywordCrawlerFlow][keyword={}][url={}]", keyword, url);
System.out.println((new HttpRequestService()).get(url));
logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
System.out.println((new SpiderBasedHttpRequest()).get(url));
}
}
... ...
... ... @@ -3,7 +3,7 @@ package com.yoho.search.spider.job;
import com.yoho.search.consumer.common.DynamicConfigService;
import com.yoho.search.consumer.suggests.common.RetryBusinessFlowExecutor;
import com.yoho.search.spider.conversation.SuggestConvertorFlow;
import com.yoho.search.spider.service.KeywordCrawlerFlow;
import com.yoho.search.spider.increment.IncrementCrawlerFlow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
... ... @@ -19,7 +19,7 @@ public class SpiderJob {
private static final Logger LOGGER = LoggerFactory.getLogger("SEARCH_SPIDER");
@Autowired
private KeywordCrawlerFlow keywordCrawlerFlow;
private IncrementCrawlerFlow incrementCrawlerFlow;
@Autowired
private SuggestConvertorFlow suggestConvertorFlow;
... ... @@ -37,7 +37,7 @@ public class SpiderJob {
return;
}
RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(keywordCrawlerFlow);
RetryBusinessFlowExecutor flowExecutor = new RetryBusinessFlowExecutor(incrementCrawlerFlow);
boolean result = flowExecutor.execute();
LOGGER.info("[func=crawleEmptySearchKeywords.end][result={}][cost={}]", result, System.currentTimeMillis() - begin);
}
... ...