Authored by Gino Zhang

增加增量爬取单个关键词的接口

@@ -11,6 +11,8 @@ public interface SuggestConversionMapper { @@ -11,6 +11,8 @@ public interface SuggestConversionMapper {
11 11
12 List<SuggestConversion> selectPageList(@Param(value = "offset") Integer offset, @Param(value = "pageSize") Integer pageSize); 12 List<SuggestConversion> selectPageList(@Param(value = "offset") Integer offset, @Param(value = "pageSize") Integer pageSize);
13 13
  14 + SuggestConversion selectBySource(@Param(value = "source") String source);
  15 +
14 List<SuggestConversion> selectAll(); 16 List<SuggestConversion> selectAll();
15 17
16 void insertBatch(List<SuggestConversion> list); 18 void insertBatch(List<SuggestConversion> list);
@@ -22,6 +22,12 @@ @@ -22,6 +22,12 @@
22 from suggest_conversion limit #{offset},#{pageSize} 22 from suggest_conversion limit #{offset},#{pageSize}
23 </select> 23 </select>
24 24
  25 + <select id="selectBySource" resultMap="BaseResultMap" timeout="20000">
  26 + select
  27 + <include refid="Base_Column_List" />
  28 + from suggest_conversion where source = #{source,jdbcType=VARCHAR}
  29 + </select>
  30 +
25 <select id="selectAll" resultMap="BaseResultMap" timeout="20000"> 31 <select id="selectAll" resultMap="BaseResultMap" timeout="20000">
26 select * from suggest_conversion 32 select * from suggest_conversion
27 </select> 33 </select>
@@ -27,6 +27,10 @@ public class SuggestConversionService { @@ -27,6 +27,10 @@ public class SuggestConversionService {
27 return suggestConversionMapper.selectPageList(offset, pageSize); 27 return suggestConversionMapper.selectPageList(offset, pageSize);
28 } 28 }
29 29
  30 + public SuggestConversion selectBySource(String source){
  31 + return suggestConversionMapper.selectBySource(source);
  32 + }
  33 +
30 public Set<String> getAllSources() { 34 public Set<String> getAllSources() {
31 List<SuggestConversion> list = suggestConversionMapper.selectAll(); 35 List<SuggestConversion> list = suggestConversionMapper.selectAll();
32 return list == null ? new HashSet<>() : list.stream().map(SuggestConversion::getSource) 36 return list == null ? new HashSet<>() : list.stream().map(SuggestConversion::getSource)
1 package com.yoho.search.spider.controller; 1 package com.yoho.search.spider.controller;
2 2
3 import com.yoho.search.spider.conversation.SuggestConvertorService; 3 import com.yoho.search.spider.conversation.SuggestConvertorService;
4 -import com.yoho.search.spider.job.SpiderJob;  
5 import com.yoho.search.spider.full.BaikeURLSpiderService; 4 import com.yoho.search.spider.full.BaikeURLSpiderService;
6 import com.yoho.search.spider.increment.BlackKeywordsMgr; 5 import com.yoho.search.spider.increment.BlackKeywordsMgr;
  6 +import com.yoho.search.spider.increment.IncrementCrawlerService;
  7 +import com.yoho.search.spider.job.SpiderJob;
7 import org.springframework.beans.factory.annotation.Autowired; 8 import org.springframework.beans.factory.annotation.Autowired;
8 import org.springframework.stereotype.Controller; 9 import org.springframework.stereotype.Controller;
9 import org.springframework.web.bind.annotation.RequestMapping; 10 import org.springframework.web.bind.annotation.RequestMapping;
@@ -31,6 +32,9 @@ public class BaikeSpiderController { @@ -31,6 +32,9 @@ public class BaikeSpiderController {
31 @Autowired 32 @Autowired
32 private BlackKeywordsMgr blackKeywordsMgr; 33 private BlackKeywordsMgr blackKeywordsMgr;
33 34
  35 + @Autowired
  36 + private IncrementCrawlerService incrementCrawlerService;
  37 +
34 @RequestMapping(value = "/spider/baike") 38 @RequestMapping(value = "/spider/baike")
35 @ResponseBody 39 @ResponseBody
36 public Map<String, Object> baikeSpider() { 40 public Map<String, Object> baikeSpider() {
@@ -41,13 +45,13 @@ public class BaikeSpiderController { @@ -41,13 +45,13 @@ public class BaikeSpiderController {
41 return result; 45 return result;
42 } catch (Exception e) { 46 } catch (Exception e) {
43 Map<String, Object> result = new HashMap(); 47 Map<String, Object> result = new HashMap();
44 - result.put("code", 503); 48 + result.put("code", 500);
45 result.put("message", e.getMessage()); 49 result.put("message", e.getMessage());
46 return result; 50 return result;
47 } 51 }
48 } 52 }
49 53
50 - @RequestMapping(value = "/spider/crawerEmptyKeywords") 54 + @RequestMapping(value = "/spider/crawlerEmptyKeywords")
51 @ResponseBody 55 @ResponseBody
52 public Map<String, Object> crawerEmptyKeywords() { 56 public Map<String, Object> crawerEmptyKeywords() {
53 Map<String, Object> result = new HashMap(); 57 Map<String, Object> result = new HashMap();
@@ -58,7 +62,7 @@ public class BaikeSpiderController { @@ -58,7 +62,7 @@ public class BaikeSpiderController {
58 result.put("message", "success"); 62 result.put("message", "success");
59 return result; 63 return result;
60 } catch (Exception e) { 64 } catch (Exception e) {
61 - result.put("code", 503); 65 + result.put("code", 500);
62 result.put("message", e.getMessage()); 66 result.put("message", e.getMessage());
63 return result; 67 return result;
64 } 68 }
@@ -75,7 +79,7 @@ public class BaikeSpiderController { @@ -75,7 +79,7 @@ public class BaikeSpiderController {
75 result.put("message", "success"); 79 result.put("message", "success");
76 return result; 80 return result;
77 } catch (Exception e) { 81 } catch (Exception e) {
78 - result.put("code", 503); 82 + result.put("code", 500);
79 result.put("message", e.getMessage()); 83 result.put("message", e.getMessage());
80 return result; 84 return result;
81 } 85 }
@@ -93,7 +97,7 @@ public class BaikeSpiderController { @@ -93,7 +97,7 @@ public class BaikeSpiderController {
93 result.put("message", "success"); 97 result.put("message", "success");
94 return result; 98 return result;
95 } catch (Exception e) { 99 } catch (Exception e) {
96 - result.put("code", 503); 100 + result.put("code", 500);
97 result.put("message", e.getMessage()); 101 result.put("message", e.getMessage());
98 return result; 102 return result;
99 } 103 }
@@ -110,7 +114,78 @@ public class BaikeSpiderController { @@ -110,7 +114,78 @@ public class BaikeSpiderController {
110 result.put("message", "success"); 114 result.put("message", "success");
111 return result; 115 return result;
112 } catch (Exception e) { 116 } catch (Exception e) {
113 - result.put("code", 503); 117 + result.put("code", 500);
  118 + result.put("message", e.getMessage());
  119 + return result;
  120 + }
  121 + }
  122 +
  123 + @RequestMapping(value = "/spider/single/crawle")
  124 + @ResponseBody
  125 + public Map<String, Object> singleCrawle(@RequestParam String keyword) {
  126 + Map<String, Object> result = new HashMap();
  127 +
  128 + try {
  129 + result.put("result", incrementCrawlerService.crawleKeyword(keyword));
  130 + result.put("code", 200);
  131 + result.put("message", "success");
  132 + return result;
  133 + } catch (Exception e) {
  134 + result.put("code", 500);
  135 + result.put("message", e.getMessage());
  136 + return result;
  137 + }
  138 + }
  139 +
  140 + @RequestMapping(value = "/spider/single/convertor")
  141 + @ResponseBody
  142 + public Map<String, Object> singleConvertor(@RequestParam String keyword) {
  143 + Map<String, Object> result = new HashMap();
  144 +
  145 + try {
  146 + result.put("result", suggestConvertorService.convertSingleKeyword(keyword));
  147 + result.put("code", 200);
  148 + result.put("message", "success");
  149 + return result;
  150 + } catch (Exception e) {
  151 + result.put("code", 500);
  152 + result.put("message", e.getMessage());
  153 + return result;
  154 + }
  155 + }
  156 +
  157 + @RequestMapping(value = "/spider/single/updateES")
  158 + @ResponseBody
  159 + public Map<String, Object> singleUpdateES(@RequestParam String keyword) {
  160 + Map<String, Object> result = new HashMap();
  161 +
  162 + try {
  163 + result.put("result", suggestConvertorService.addOrUpdateIndex(keyword));
  164 + result.put("code", 200);
  165 + result.put("message", "success");
  166 + return result;
  167 + } catch (Exception e) {
  168 + result.put("code", 500);
  169 + result.put("message", e.getMessage());
  170 + return result;
  171 + }
  172 + }
  173 +
  174 + @RequestMapping(value = "/spider/single/increment")
  175 + @ResponseBody
  176 + public Map<String, Object> singleIncrement(@RequestParam String keyword) {
  177 + Map<String, Object> result = new HashMap();
  178 +
  179 + try {
  180 + boolean incmentResult = incrementCrawlerService.crawleKeyword(keyword)
  181 + && suggestConvertorService.convertSingleKeyword(keyword)
  182 + && suggestConvertorService.addOrUpdateIndex(keyword);
  183 + result.put("result", incmentResult);
  184 + result.put("code", 200);
  185 + result.put("message", "success");
  186 + return result;
  187 + } catch (Exception e) {
  188 + result.put("code", 500);
114 result.put("message", e.getMessage()); 189 result.put("message", e.getMessage());
115 return result; 190 return result;
116 } 191 }
1 package com.yoho.search.spider.conversation; 1 package com.yoho.search.spider.conversation;
2 2
  3 +import com.yoho.search.base.utils.DateUtil;
  4 +import com.yoho.search.base.utils.ISearchConstants;
3 import com.yoho.search.consumer.index.common.AnalyzerHelper; 5 import com.yoho.search.consumer.index.common.AnalyzerHelper;
  6 +import com.yoho.search.consumer.index.common.IYohoIndexService;
4 import com.yoho.search.consumer.service.base.*; 7 import com.yoho.search.consumer.service.base.*;
5 import com.yoho.search.consumer.suggests.common.SuggestionConstants; 8 import com.yoho.search.consumer.suggests.common.SuggestionConstants;
6 import com.yoho.search.dal.model.*; 9 import com.yoho.search.dal.model.*;
7 import org.apache.commons.collections.CollectionUtils; 10 import org.apache.commons.collections.CollectionUtils;
8 -import org.apache.commons.collections.map.HashedMap;  
9 import org.apache.commons.lang.StringUtils; 11 import org.apache.commons.lang.StringUtils;
10 import org.slf4j.Logger; 12 import org.slf4j.Logger;
11 import org.slf4j.LoggerFactory; 13 import org.slf4j.LoggerFactory;
12 import org.springframework.beans.factory.annotation.Autowired; 14 import org.springframework.beans.factory.annotation.Autowired;
13 import org.springframework.stereotype.Component; 15 import org.springframework.stereotype.Component;
  16 +import org.springframework.util.Assert;
14 17
15 import java.util.*; 18 import java.util.*;
16 import java.util.stream.Collectors; 19 import java.util.stream.Collectors;
@@ -43,6 +46,12 @@ public class SuggestConvertorService { @@ -43,6 +46,12 @@ public class SuggestConvertorService {
43 @Autowired 46 @Autowired
44 private SpiderContentService spiderContentService; 47 private SpiderContentService spiderContentService;
45 48
  49 + @Autowired
  50 + private SuggestConversionService suggestConversionService;
  51 +
  52 + @Autowired
  53 + private IYohoIndexService indexService;
  54 +
46 public YohoKeywordsBO buildYohoKeywordBO() { 55 public YohoKeywordsBO buildYohoKeywordBO() {
47 Set<String> keywordSet = new HashSet<>(3000); 56 Set<String> keywordSet = new HashSet<>(3000);
48 Set<String> filterSortNameSet = new HashSet<>(300); 57 Set<String> filterSortNameSet = new HashSet<>(300);
@@ -188,8 +197,48 @@ public class SuggestConvertorService { @@ -188,8 +197,48 @@ public class SuggestConvertorService {
188 return score / keywordTokens.size(); 197 return score / keywordTokens.size();
189 } 198 }
190 199
  200 + public boolean convertSingleKeyword(String keyword) {
  201 + Assert.notNull(keyword);
  202 + SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword);
  203 + if (spiderContent == null) {
  204 + return false;
  205 + }
  206 +
  207 + YohoKeywordsBO yohoKeywordsBO = buildYohoKeywordBO();
  208 + String dest = convert(spiderContent, yohoKeywordsBO, new HashMap());
  209 + if (StringUtils.isNotEmpty(dest)) {
  210 + SuggestConversion suggestConversion = new SuggestConversion();
  211 + suggestConversion.setSource(spiderContent.getSubject());
  212 + suggestConversion.setDest(dest);
  213 + suggestConversion.setCreateTime(DateUtil.getCurrentTimeSecond());
  214 + suggestConversionService.insertBatch(Arrays.asList(suggestConversion));
  215 + return true;
  216 + }
  217 +
  218 + return false;
  219 + }
  220 +
  221 + public boolean addOrUpdateIndex(String keyword) {
  222 + Assert.notNull(keyword);
  223 + SuggestConversion suggestConversion = suggestConversionService.selectBySource(keyword);
  224 + if (suggestConversion == null) {
  225 + return false;
  226 + }
  227 +
  228 + try {
  229 + Map<String, String> dataMap = new HashMap<>();
  230 + dataMap.put("source", suggestConversion.getSource());
  231 + dataMap.put("dest", suggestConversion.getDest());
  232 + indexService.updateIndexData(ISearchConstants.INDEX_NAME_CONVERSION, suggestConversion.getId().toString(), dataMap);
  233 + return true;
  234 + } catch (Exception e) {
  235 + logger.error(e.getMessage(), e);
  236 + return false;
  237 + }
  238 + }
  239 +
191 public Map<String, Object> explainConversion(String keyword) { 240 public Map<String, Object> explainConversion(String keyword) {
192 - Map<String, Object> resultMap = new HashedMap(); 241 + Map<String, Object> resultMap = new HashMap();
193 SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword); 242 SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword);
194 if (spiderContent == null) { 243 if (spiderContent == null) {
195 throw new RuntimeException("The spider content doesn't exist"); 244 throw new RuntimeException("The spider content doesn't exist");
@@ -47,10 +47,10 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { @@ -47,10 +47,10 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
47 private AnalyzerHelper analyzerHelper; 47 private AnalyzerHelper analyzerHelper;
48 48
49 @Autowired 49 @Autowired
50 - private SpiderBasedHttpRequest spiderBasedHttpRequest; 50 + private BlackKeywordsMgr blackKeywordsMgr;
51 51
52 @Autowired 52 @Autowired
53 - private BlackKeywordsMgr blackKeywordsMgr; 53 + private IncrementCrawlerService incrementCrawlerService;
54 54
55 private List<String> validKeywordList = null; 55 private List<String> validKeywordList = null;
56 56
@@ -133,7 +133,7 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { @@ -133,7 +133,7 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
133 List<BaikeBO> baikeBOList = new ArrayList<>(); 133 List<BaikeBO> baikeBOList = new ArrayList<>();
134 BaikeBO tempBaikeBO; 134 BaikeBO tempBaikeBO;
135 for (String keyword : subListKeywords) { 135 for (String keyword : subListKeywords) {
136 - if ((tempBaikeBO = crawle(keyword)) != null) { 136 + if ((tempBaikeBO = incrementCrawlerService.doCrawle(keyword)) != null) {
137 tempBaikeBO.setTitle(keyword); 137 tempBaikeBO.setTitle(keyword);
138 baikeBOList.add(tempBaikeBO); 138 baikeBOList.add(tempBaikeBO);
139 } else { 139 } else {
@@ -151,18 +151,6 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow { @@ -151,18 +151,6 @@ public class IncrementCrawlerFlow implements RetryBusinessFlow {
151 return true; 151 return true;
152 } 152 }
153 153
154 - private BaikeBO crawle(String keyword) {  
155 - try {  
156 - String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");  
157 - logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);  
158 - return spiderBasedHttpRequest.get(url);  
159 - } catch (Exception e) {  
160 - logger.error("crawle keyword [" + keyword + "] failed!", e);  
161 - }  
162 -  
163 - return null;  
164 - }  
165 -  
166 @Override 154 @Override
167 public void finish(boolean doBusinessResult, Exception exception) { 155 public void finish(boolean doBusinessResult, Exception exception) {
168 this.validKeywordList = null; 156 this.validKeywordList = null;
package com.yoho.search.spider.increment;

import com.yoho.search.consumer.service.base.SpiderContentService;
import com.yoho.search.dal.model.SpiderContent;
import com.yoho.search.spider.common.BaikeBO;
import com.yoho.search.spider.common.SpiderBasedHttpRequest;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.Assert;

import java.net.URLEncoder;
import java.util.Collections;

/**
 * Crawls Baidu Baike pages for single keywords and persists the fetched
 * content via {@link SpiderContentService}.
 *
 * Created by ginozhang on 2017/3/3.
 */
@Component
public class IncrementCrawlerService {

    private static final Logger logger = LoggerFactory.getLogger("SEARCH_SPIDER");

    @Autowired
    private SpiderContentService spiderContentService;

    @Autowired
    private SpiderBasedHttpRequest spiderBasedHttpRequest;

    /**
     * Crawl the given keyword unless its content is already stored.
     *
     * @param keyword non-empty keyword to crawl
     * @return true when content already exists or was crawled and saved;
     *         false when the crawl failed
     */
    public boolean crawleKeyword(String keyword) {
        Assert.isTrue(StringUtils.isNotEmpty(keyword));
        // Already crawled: treat as success and skip the network round trip.
        SpiderContent spiderContent = spiderContentService.selectSpiderContentBySubject(keyword);
        if (spiderContent != null) {
            return true;
        }

        BaikeBO baikeBO = doCrawle(keyword);
        if (baikeBO != null) {
            // singletonList: clearer (and cheaper) than Arrays.asList for one element.
            spiderContentService.insertBatch(Collections.singletonList(baikeBO.toSpiderContent()));
            return true;
        }

        return false;
    }

    /**
     * Fetch and parse the Baike page for the keyword.
     *
     * @param keyword keyword to look up; URL-encoded as UTF-8
     * @return the parsed page, or null when the request or encoding failed
     *         (the error is logged, never propagated)
     */
    public BaikeBO doCrawle(String keyword) {
        try {
            String url = "http://baike.baidu.com/item/" + URLEncoder.encode(keyword, "UTF-8");
            logger.info("[func=IncrementCrawlerFlow][keyword={}][url={}]", keyword, url);
            return spiderBasedHttpRequest.get(url);
        } catch (Exception e) {
            logger.error("crawle keyword [" + keyword + "] failed!", e);
        }

        return null;
    }

}