Authored by Gino Zhang

增加增量爬取单个关键词的接口

... ... @@ -11,6 +11,8 @@ public interface SuggestConversionMapper {
/**
 * Returns a page of suggest-conversion rows.
 * The arguments are bound to the mapper statement as the named parameters
 * {@code offset} and {@code pageSize}.
 */
List<SuggestConversion> selectPageList(@Param(value = "offset") Integer offset, @Param(value = "pageSize") Integer pageSize);
/**
 * Returns the suggest-conversion row whose {@code source} column equals the given value.
 * The argument is bound to the mapper statement as the named parameter {@code source}.
 * NOTE(review): the return type is a single object, so this presumably relies on
 * {@code source} being unique in the table; confirm against the schema.
 */
SuggestConversion selectBySource(@Param(value = "source") String source);
/** Returns every row of the suggest_conversion table. */
List<SuggestConversion> selectAll();
/**
 * Inserts the given rows in one call.
 * NOTE(review): the backing SQL statement is not visible here; verify its
 * behaviour for an empty list and for already-existing sources.
 */
void insertBatch(List<SuggestConversion> list);
... ...
... ... @@ -22,6 +22,12 @@
from suggest_conversion limit #{offset},#{pageSize}
</select>
<!--
  Selects the Base_Column_List columns of suggest_conversion rows whose source
  column equals the bound "source" parameter.
  NOTE(review): the mapper method for this statement returns a single object, so
  MyBatis raises TooManyResultsException when several rows share one source;
  confirm that source is unique.
  NOTE(review): MyBatis documents the statement timeout attribute in seconds, so
  20000 is roughly five and a half hours; confirm this is intended.
-->
<select id="selectBySource" resultMap="BaseResultMap" timeout="20000">
select
<include refid="Base_Column_List" />
from suggest_conversion where source = #{source,jdbcType=VARCHAR}
</select>
<!--
  Returns every row of suggest_conversion, mapped through BaseResultMap.
  Uses "select *" rather than the Base_Column_List fragment used by the
  selectBySource statement.
-->
<select id="selectAll" resultMap="BaseResultMap" timeout="20000">
select * from suggest_conversion
</select>
... ...
... ... @@ -27,6 +27,10 @@ public class SuggestConversionService {
return suggestConversionMapper.selectPageList(offset, pageSize);
}
/**
 * Fetches the suggest conversion stored for the given source keyword.
 *
 * @param source value handed unchanged to the mapper lookup
 * @return whatever the mapper returns for that source
 */
public SuggestConversion selectBySource(String source) {
    final SuggestConversion match = suggestConversionMapper.selectBySource(source);
    return match;
}
public Set<String> getAllSources() {
List<SuggestConversion> list = suggestConversionMapper.selectAll();
return list == null ? new HashSet<>() : list.stream().map(SuggestConversion::getSource)
... ...
package com.yoho.search.spider.controller;
import com.yoho.search.spider.conversation.SuggestConvertorService;
import com.yoho.search.spider.job.SpiderJob;
import com.yoho.search.spider.full.BaikeURLSpiderService;
import com.yoho.search.spider.increment.BlackKeywordsMgr;
import com.yoho.search.spider.increment.IncrementCrawlerService;
import com.yoho.search.spider.job.SpiderJob;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.RequestMapping;
... ... @@ -31,6 +32,9 @@ public class BaikeSpiderController {
@Autowired
private BlackKeywordsMgr blackKeywordsMgr;
@Autowired
private IncrementCrawlerService incrementCrawlerService;
@RequestMapping(value = "/spider/baike")
@ResponseBody
public Map<String, Object> baikeSpider() {
... ... @@ -41,13 +45,13 @@ public class BaikeSpiderController {
return result;
} catch (Exception e) {
Map<String, Object> result = new HashMap();
result.put("code", 503);
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
}
@RequestMapping(value = "/spider/crawerEmptyKeywords")
@RequestMapping(value = "/spider/crawlerEmptyKeywords")
@ResponseBody
public Map<String, Object> crawerEmptyKeywords() {
Map<String, Object> result = new HashMap();
... ... @@ -58,7 +62,7 @@ public class BaikeSpiderController {
result.put("message", "success");
return result;
} catch (Exception e) {
result.put("code", 503);
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
... ... @@ -75,7 +79,7 @@ public class BaikeSpiderController {
result.put("message", "success");
return result;
} catch (Exception e) {
result.put("code", 503);
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
... ... @@ -93,7 +97,7 @@ public class BaikeSpiderController {
result.put("message", "success");
return result;
} catch (Exception e) {
result.put("code", 503);
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
... ... @@ -110,7 +114,78 @@ public class BaikeSpiderController {
result.put("message", "success");
return result;
} catch (Exception e) {
result.put("code", 503);
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
}
/**
 * Crawls a single keyword on demand.
 *
 * <p>Delegates to {@code incrementCrawlerService.crawleKeyword} and wraps the outcome
 * in the response map: {@code result} (boolean outcome), {@code code} (200) and
 * {@code message} ("success"). If the service throws, the map carries {@code code}
 * 500 and the exception message instead, and no {@code result} entry.
 *
 * @param keyword keyword to crawl, taken from the request parameter
 * @return the response map described above
 */
@RequestMapping(value = "/spider/single/crawle")
@ResponseBody
public Map<String, Object> singleCrawle(@RequestParam String keyword) {
    // Diamond instead of the raw HashMap so the assignment is type-checked.
    Map<String, Object> result = new HashMap<>();
    try {
        result.put("result", incrementCrawlerService.crawleKeyword(keyword));
        result.put("code", 200);
        result.put("message", "success");
    } catch (Exception e) {
        result.put("code", 500);
        result.put("message", e.getMessage());
    }
    return result;
}
/**
 * Runs the suggest conversion for a single keyword on demand.
 *
 * <p>Delegates to {@code suggestConvertorService.convertSingleKeyword} and wraps the
 * outcome in the response map: {@code result} (boolean outcome), {@code code} (200)
 * and {@code message} ("success"). If the service throws, the map carries
 * {@code code} 500 and the exception message instead, and no {@code result} entry.
 *
 * @param keyword keyword to convert, taken from the request parameter
 * @return the response map described above
 */
@RequestMapping(value = "/spider/single/convertor")
@ResponseBody
public Map<String, Object> singleConvertor(@RequestParam String keyword) {
    // Diamond instead of the raw HashMap so the assignment is type-checked.
    Map<String, Object> result = new HashMap<>();
    try {
        result.put("result", suggestConvertorService.convertSingleKeyword(keyword));
        result.put("code", 200);
        result.put("message", "success");
    } catch (Exception e) {
        result.put("code", 500);
        result.put("message", e.getMessage());
    }
    return result;
}
/**
 * Pushes the stored conversion of a single keyword into the search index on demand.
 *
 * <p>Delegates to {@code suggestConvertorService.addOrUpdateIndex} and wraps the
 * outcome in the response map: {@code result} (boolean outcome), {@code code} (200)
 * and {@code message} ("success"). If the service throws, the map carries
 * {@code code} 500 and the exception message instead, and no {@code result} entry.
 *
 * @param keyword keyword whose conversion should be indexed, taken from the request parameter
 * @return the response map described above
 */
@RequestMapping(value = "/spider/single/updateES")
@ResponseBody
public Map<String, Object> singleUpdateES(@RequestParam String keyword) {
    // Diamond instead of the raw HashMap so the assignment is type-checked.
    Map<String, Object> result = new HashMap<>();
    try {
        result.put("result", suggestConvertorService.addOrUpdateIndex(keyword));
        result.put("code", 200);
        result.put("message", "success");
    } catch (Exception e) {
        result.put("code", 500);
        result.put("message", e.getMessage());
    }
    return result;
}
@RequestMapping(value = "/spider/single/increment")
@ResponseBody
public Map<String, Object> singleIncrement(@RequestParam String keyword) {
Map<String, Object> result = new HashMap();
try {
boolean incmentResult = incrementCrawlerService.crawleKeyword(keyword)
&& suggestConvertorService.convertSingleKeyword(keyword)
&& suggestConvertorService.addOrUpdateIndex(keyword);
result.put("result", incmentResult);
result.put("code", 200);
result.put("message", "success");
return result;
} catch (Exception e) {
result.put("code", 500);
result.put("message", e.getMessage());
return result;
}
... ...
package com.yoho.search.spider.conversation;
import com.yoho.search.base.utils.DateUtil;
import com.yoho.search.base.utils.ISearchConstants;
import com.yoho.search.consumer.index.common.AnalyzerHelper;
import com.yoho.search.consumer.index.common.IYohoIndexService;
import com.yoho.search.consumer.service.base.*;
import com.yoho.search.consumer.suggests.common.SuggestionConstants;
import com.yoho.search.dal.model.*;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.map.HashedMap;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.Assert;
import java.util.*;
import java.util.stream.Collectors;
... ... @@ -43,6 +46,12 @@ public class SuggestConvertorService {
@Autowired
private SpiderContentService spiderContentService;
@Autowired
private SuggestConversionService suggestConversionService;
@Autowired
private IYohoIndexService indexService;
public YohoKeywordsBO buildYohoKeywordBO() {
Set<String> keywordSet = new HashSet<>(3000);
Set<String> filterSortNameSet = new HashSet<>(300);
... ... @@ -188,8 +197,48 @@ public class SuggestConvertorService {
return score / keywordTokens.size();
}
/**
 * Converts the crawled content of one keyword into a suggest-conversion row.
 *
 * <p>Looks up the spider content whose subject equals {@code keyword}, runs it through
 * {@code convert} against a freshly built {@link YohoKeywordsBO}, and inserts the
 * resulting source/dest pair when the conversion yields a non-empty destination.
 *
 * <p>NOTE(review): every successful call inserts a row via {@code insertBatch}; whether
 * repeated calls for the same keyword create duplicate rows depends on that statement
 * and on table constraints that are not visible here.
 *
 * @param keyword subject of the spider content to convert; must not be null
 * @return {@code true} if a conversion row was inserted; {@code false} when no spider
 *         content exists for the keyword or the conversion result is empty
 * @throws IllegalArgumentException if {@code keyword} is null
 */
public boolean convertSingleKeyword(String keyword) {
    // Explicit message: callers (e.g. the controller) surface it to the API client.
    Assert.notNull(keyword, "keyword must not be null");

    SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword);
    if (spiderContent == null) {
        return false;
    }

    YohoKeywordsBO yohoKeywordsBO = buildYohoKeywordBO();
    // The third argument is a throw-away map; diamond avoids the raw-type warning.
    String dest = convert(spiderContent, yohoKeywordsBO, new HashMap<>());
    if (StringUtils.isEmpty(dest)) {
        return false;
    }

    SuggestConversion suggestConversion = new SuggestConversion();
    suggestConversion.setSource(spiderContent.getSubject());
    suggestConversion.setDest(dest);
    suggestConversion.setCreateTime(DateUtil.getCurrentTimeSecond());
    suggestConversionService.insertBatch(Arrays.asList(suggestConversion));
    return true;
}
/**
 * Writes the stored conversion of one keyword into the conversion index.
 *
 * <p>Loads the {@link SuggestConversion} whose source equals {@code keyword} and sends
 * its {@code source}/{@code dest} pair to {@code indexService.updateIndexData}, using
 * the row id as the document id. Any exception raised while building or sending the
 * update is logged and reported as {@code false}; the lookup itself is outside the
 * try block, so its exceptions propagate to the caller.
 *
 * @param keyword source keyword of the conversion to index; must not be null
 * @return {@code true} if the index update call completed; {@code false} when no
 *         conversion exists for the keyword or the update failed
 * @throws IllegalArgumentException if {@code keyword} is null
 */
public boolean addOrUpdateIndex(String keyword) {
    // Explicit message: callers (e.g. the controller) surface it to the API client.
    Assert.notNull(keyword, "keyword must not be null");

    SuggestConversion suggestConversion = suggestConversionService.selectBySource(keyword);
    if (suggestConversion == null) {
        return false;
    }

    try {
        Map<String, String> dataMap = new HashMap<>();
        dataMap.put("source", suggestConversion.getSource());
        dataMap.put("dest", suggestConversion.getDest());
        indexService.updateIndexData(ISearchConstants.INDEX_NAME_CONVERSION, suggestConversion.getId().toString(), dataMap);
        return true;
    } catch (Exception e) {
        logger.error(e.getMessage(), e);
        return false;
    }
}
public Map<String, Object> explainConversion(String keyword) {
Map<String, Object> resultMap = new HashedMap();
Map<String, Object> resultMap = new HashMap();
SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword);
if (spiderContent == null) {
throw new RuntimeException("The spider content doesn't exist");
... ...
... ... @@ -47,10 +47,10 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
private AnalyzerHelper analyzerHelper;
@Autowired
private SpiderBasedHttpRequest spiderBasedHttpRequest;
private BlackKeywordsMgr blackKeywordsMgr;
@Autowired
private BlackKeywordsMgr blackKeywordsMgr;
private IncrementCrawlerService incrementCrawlerService;
private List<String> validKeywordList = null;
... ... @@ -133,7 +133,7 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
List<BaikeBO> baikeBOList = new ArrayList<>();
BaikeBO tempBaikeBO;
for (String keyword : subListKeywords) {
if ((tempBaikeBO = crawle(keyword)) != null) {
if ((tempBaikeBO = incrementCrawlerService.doCrawle(keyword)) != null) {
tempBaikeBO.setTitle(keyword);
baikeBOList.add(tempBaikeBO);
} else {
... ... @@ -151,18 +151,6 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
return true;
}
/**
 * Fetches the Baike page for a keyword.
 *
 * @param keyword keyword to look up; it is URL-encoded as UTF-8 and appended to the item URL
 * @return the object returned by the HTTP helper, or {@code null} if anything throws
 */
private BaikeBO crawle(String keyword) {
    BaikeBO crawled = null;
    try {
        final String encodedKeyword = URLEncoder.encode(keyword, "UTF-8");
        final String url = "http://baike.baidu.com/item/" + encodedKeyword;
        logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
        crawled = spiderBasedHttpRequest.get(url);
    } catch (Exception e) {
        // Best effort: a failed crawl is logged and reported as "nothing found".
        logger.error("crawle keyword [" + keyword + "] failed!", e);
    }
    return crawled;
}
@Override
public void finish(boolean doBusinessResult, Exception exception) {
this.validKeywordList = null;
... ...
package com.yoho.search.spider.increment;
import com.yoho.search.consumer.service.base.SpiderContentService;
import com.yoho.search.dal.model.SpiderContent;
import com.yoho.search.spider.common.BaikeBO;
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.Assert;
import java.net.URLEncoder;
import java.util.Arrays;
/**
 * Crawls Baike content for individual keywords and stores it as spider content.
 *
 * <p>Created by ginozhang on 2017/3/3.
 */
@Component
public class IncrementCrawlerService {

    private static final Logger logger = LoggerFactory.getLogger("SEARCH_SPIDER");

    @Autowired
    private SpiderContentService spiderContentService;

    @Autowired
    private SpiderBasedHttpRequest spiderBasedHttpRequest;

    /**
     * Makes sure spider content exists for the given keyword, crawling it if necessary.
     *
     * <p>If content with this subject is already stored, nothing is crawled. Otherwise
     * the keyword is crawled via {@link #doCrawle(String)} and, when the crawl returns
     * a result, that result is inserted.
     *
     * @param keyword keyword to crawl; must be a non-empty string
     * @return {@code true} if content already existed or was crawled and inserted;
     *         {@code false} if the crawl returned nothing
     * @throws IllegalArgumentException if {@code keyword} is null or empty
     */
    public boolean crawleKeyword(String keyword) {
        // Explicit message: callers (e.g. the controller) surface it to the API client.
        Assert.isTrue(StringUtils.isNotEmpty(keyword), "keyword must not be empty");

        if (spiderContentService.selectSpiderContentBySubject(keyword) != null) {
            return true;
        }

        BaikeBO baikeBO = doCrawle(keyword);
        if (baikeBO == null) {
            return false;
        }

        // Mirrors IncrementCrawlerFlow, which sets the title to the keyword before
        // persisting a crawl result. Presumably the stored subject is derived from the
        // title, so later lookups by keyword find this row; confirm against
        // BaikeBO.toSpiderContent.
        baikeBO.setTitle(keyword);
        spiderContentService.insertBatch(Arrays.asList(baikeBO.toSpiderContent()));
        return true;
    }

    /**
     * Fetches the Baike page for a keyword.
     *
     * <p>NOTE(review): {@code URLEncoder} performs form encoding, so a space in the
     * keyword becomes {@code +} inside the URL path; confirm the target site resolves
     * such URLs as intended.
     *
     * @param keyword keyword to look up; it is URL-encoded as UTF-8 and appended to the item URL
     * @return the object returned by the HTTP helper, or {@code null} if anything throws
     */
    public BaikeBO doCrawle(String keyword) {
        try {
            String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
            // NOTE(review): the func tag still names IncrementCrawlerFlow, where this code
            // used to live; left unchanged in case log searches depend on it.
            logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
            return spiderBasedHttpRequest.get(url);
        } catch (Exception e) {
            // Best effort: a failed crawl is logged and reported as "nothing found".
            // The trailing throwable is logged with its stack trace by SLF4J.
            logger.error("crawle keyword [{}] failed!", keyword, e);
            return null;
        }
    }
}
... ...