Commit 099f58c5 by 王明范

Merge branch 'TECH-SEARCH-merge-keyword-tag' into 'master'

Tech search merge keyword tag

See merge request !4
parents c31088f6 70d2074b
...@@ -36,7 +36,41 @@ ...@@ -36,7 +36,41 @@
</profiles> </profiles>
<dependencies> <dependencies>
<!-- SQP client library; presumably the source of the com.secoo.search.sqp4j.* types used by the
     new SqpDubboClient / QueryPlanHelper classes (confirm). The log4j binding, log4j itself and
     secoo-log are excluded from its transitive dependencies. -->
<dependency>
<groupId>com.secoo</groupId>
<artifactId>sqp4j-client</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<artifactId>secoo-log</artifactId>
<groupId>com.secoo.mall</groupId>
</exclusion>
</exclusions>
<version>2.9.6.RELEASE</version>
</dependency>
<!-- Dubbo RPC framework (com.alibaba.dubbo.config.* is used by SqpDubboClient). Its transitive
     spring-context is excluded; presumably the project supplies its own Spring version (confirm). -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>dubbo</artifactId>
<version>2.6.0</version>
<exclusions>
<exclusion>
<artifactId>spring-context</artifactId>
<groupId>org.springframework</groupId>
</exclusion>
</exclusions>
</dependency>
<!-- ZooKeeper client; SqpDubboClient selects the "zkclient" registry client, which presumably
     resolves to this artifact (confirm). -->
<dependency>
<groupId>com.github.sgroschupf</groupId>
<artifactId>zkclient</artifactId>
<version>0.1</version>
</dependency>
<dependency> <dependency>
<groupId>mysql</groupId> <groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId> <artifactId>mysql-connector-java</artifactId>
......
package com.secoo.so.suggest.client;
import com.alibaba.dubbo.config.ApplicationConfig;
import com.alibaba.dubbo.config.ReferenceConfig;
import com.alibaba.dubbo.config.RegistryConfig;
import com.secoo.search.sqp4j.QueryPlan;
import org.apache.log4j.Logger;
import java.util.Map;
/**
 * Factory for Dubbo references to the SQP {@code QueryPlan} service.
 *
 * <p>One {@link ReferenceConfig} is created lazily per environment (prod, uat, test) and per
 * direct-connect URL, and then reused. Creation is guarded so that concurrent first calls cannot
 * build two configurations for the same target.
 *
 * @author wangmingfan
 * @date 2020/8/17
 */
public class SqpDubboClient {
    private static final Logger LOGGER = Logger.getLogger(SqpDubboClient.class);

    private static final String APPLICATION_NAME = "sem-test-tool";
    private static final String SERVICE_INTERFACE = "com.secoo.search.sqp4j.QueryPlan";
    private static final String SERVICE_VERSION = "1.0.0";
    private static final String PROD_REGISTRY_ADDRESS =
            "zookeeper://zk-mall1.secoolocal.com:5181?backup=zk-mall2.secoolocal.com:5181,zk-mall3.secoolocal.com:5181";
    private static final String TEST_REGISTRY_ADDRESS =
            "zookeeper://10.185.240.81:2181?backup=10.185.240.82:2181,10.185.240.83:2181";

    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigProd = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigUat = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigTest = null;

    /** Direct-connect configurations keyed by provider URL, so repeated calls reuse one reference. */
    private static final Map<String, ReferenceConfig<QueryPlan>> DIRECT_REFERENCE_CONFIGS =
            new java.util.concurrent.ConcurrentHashMap<String, ReferenceConfig<QueryPlan>>();

    /**
     * Returns a proxy that talks directly to one provider, bypassing the registry.
     *
     * @param url provider address in {@code ip:port} form
     * @param map optional out-parameter; when non-null it receives the reference's client,
     *            interface, protocol, url and cluster settings
     * @return the service proxy
     */
    public static QueryPlan getDirectImpl(String url, Map<String, String> map) {
        ReferenceConfig<QueryPlan> impl = directDubboSqpReferenceConfig(url);
        QueryPlan dubboSqp = impl.get();
        // A null map simply means the caller is not interested in the connection details.
        if (map != null) {
            map.put("Client", impl.getClient());
            map.put("Interface", impl.getInterface());
            map.put("Protocol", impl.getProtocol());
            map.put("Url", impl.getUrl());
            map.put("Cluster", impl.getCluster());
        }
        return dubboSqp;
    }

    /**
     * Returns the proxy resolved through the test registry (no group is set).
     *
     * @return the service proxy
     */
    public static QueryPlan getTestImpl() {
        return getTestDubboSqpReferenceConfig().get();
    }

    /**
     * Returns the proxy resolved through the production registry with group {@code "grey"}.
     *
     * @return the service proxy
     */
    public static QueryPlan getUatImpl() {
        return getDubboSqpReferenceConfigUat().get();
    }

    /**
     * Returns the proxy resolved through the production registry (no group is set).
     *
     * @return the service proxy
     */
    public static QueryPlan getProdImpl() {
        return getDubboSqpReferenceConfigProd().get();
    }

    /** Lazily creates the production reference configuration. */
    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigProd() {
        if (dubboSqpReferenceConfigProd == null) {
            dubboSqpReferenceConfigProd = buildDubboSqpReferenceConfig();
        }
        return dubboSqpReferenceConfigProd;
    }

    /** Lazily creates the uat reference configuration: production registry, group {@code "grey"}. */
    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigUat() {
        if (dubboSqpReferenceConfigUat == null) {
            ReferenceConfig<QueryPlan> impl = buildDubboSqpReferenceConfig();
            impl.setGroup("grey");
            // Publish only after the group is set so no caller can observe a half-configured reference.
            dubboSqpReferenceConfigUat = impl;
        }
        return dubboSqpReferenceConfigUat;
    }

    /** Builds a reference configuration that resolves providers through the production registry. */
    private static ReferenceConfig<QueryPlan> buildDubboSqpReferenceConfig() {
        return newRegistryReferenceConfig(PROD_REGISTRY_ADDRESS);
    }

    /**
     * Lazily creates the reference configuration for the test environment.
     *
     * @return the shared test reference configuration
     */
    private static synchronized ReferenceConfig<QueryPlan> getTestDubboSqpReferenceConfig() {
        if (dubboSqpReferenceConfigTest == null) {
            dubboSqpReferenceConfigTest = newRegistryReferenceConfig(TEST_REGISTRY_ADDRESS);
        }
        return dubboSqpReferenceConfigTest;
    }

    /**
     * Returns the direct-connect reference configuration for a provider, creating it on first use.
     *
     * @param url provider address in {@code ip:port} form, e.g. {@code 10.185.240.158:20062}
     * @return the reference configuration for that provider
     */
    private static ReferenceConfig<QueryPlan> directDubboSqpReferenceConfig(String url) {
        if (url == null) {
            // ConcurrentHashMap rejects null keys; keep the previous "build and let Dubbo decide" behavior.
            return newDirectReferenceConfig(null);
        }
        return DIRECT_REFERENCE_CONFIGS.computeIfAbsent(url, SqpDubboClient::newDirectReferenceConfig);
    }

    /** Builds a reference configuration that connects straight to {@code url}. */
    private static ReferenceConfig<QueryPlan> newDirectReferenceConfig(String url) {
        ReferenceConfig<QueryPlan> impl = newBaseReferenceConfig();
        impl.setUrl(url);
        return impl;
    }

    /** Builds a reference configuration that resolves providers through a ZooKeeper registry. */
    private static ReferenceConfig<QueryPlan> newRegistryReferenceConfig(String registryAddress) {
        ReferenceConfig<QueryPlan> impl = newBaseReferenceConfig();
        RegistryConfig registryConfig = new RegistryConfig(registryAddress);
        registryConfig.setProtocol("zookeeper");
        registryConfig.setClient("zkclient");
        impl.setRegistry(registryConfig);
        return impl;
    }

    /** Settings shared by every reference: protocol, application name, service version and interface. */
    private static ReferenceConfig<QueryPlan> newBaseReferenceConfig() {
        ReferenceConfig<QueryPlan> impl = new ReferenceConfig<QueryPlan>();
        impl.setProtocol("dubbo");
        impl.setApplication(new ApplicationConfig(APPLICATION_NAME));
        impl.setVersion(SERVICE_VERSION);
        impl.setInterface(SERVICE_INTERFACE);
        return impl;
    }
}
package com.secoo.so.suggest.helper;
import com.secoo.abtest.common.Buckets;
import com.secoo.search.sqp4j.Explanation;
import com.secoo.search.sqp4j.Explanations;
import com.secoo.search.sqp4j.QueryPlan;
import com.secoo.search.sqp4j.QueryWord;
import com.secoo.search.sqp4j.client.QueryPlanClient;
import com.secoo.so.suggest.client.SqpDubboClient;
import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.util.FileUtils;
import com.secoo.so.suggest.util.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
 * Lazily created singleton that asks the SQP {@link QueryPlan} service how a keyword is explained
 * and memoises the answers.
 *
 * <p>Word counts are kept in {@link #keywordMap} and mirrored to the file named by the
 * {@code queryPlan.cachePath} setting, one {@code keyword,wordCount,epochSeconds} line per entry.
 * Full explanations are kept only in a bounded in-memory cache. The lookup and caching methods
 * are not synchronized.
 *
 * @author wangmingfan
 * @date 2022/8/4
 */
public class QueryPlanHelper {
    private static final Logger LOG = LoggerFactory.getLogger(QueryPlanHelper.class);

    /** Maximum number of explanations kept in {@link #sqpCache} before the oldest is evicted. */
    private static final int MAX_EXPLANATION_CACHE_SIZE = 100000;
    /** Age, in seconds, of the oldest persisted entry after which the whole cache is rewritten. */
    private static final long FULL_REFRESH_AGE_SECONDS = 3600L * 24 * 7;

    private static volatile QueryPlanHelper instance;
    private static QueryPlan client = null;

    Map<String, Explanation> sqpCache = new HashMap<>();
    /** Keys of {@link #sqpCache} in insertion order; index 0 is the next one to evict. */
    List<String> wordList = new ArrayList<>();
    Map<String, Integer> keywordMap = new HashMap<>();

    private static String queryPlanFile = ""; // e.g. /data/crontab/test/tmp/queryplan.txt
    /** Lines produced since the last {@link #writeQueryPlanToFile()} call. */
    private static List<String> newWordLines = new ArrayList<>();
    /** Smallest timestamp (epoch seconds) seen while loading the cache file. */
    private static long minTimeStamp = Long.MAX_VALUE;

    private QueryPlanHelper() {
        queryPlanFile = ConfigUtil.getString("queryPlan.cachePath", "");
        LOG.info("debugLog queryPlanFile path is {}", queryPlanFile);
        client = SqpDubboClient.getProdImpl();
        loadQueryPlanFromFile();
    }

    /**
     * Returns the shared instance, creating it (and loading the cache file) on first use.
     *
     * @return the singleton helper
     */
    public static QueryPlanHelper getInstance() {
        if (instance == null) {
            synchronized (QueryPlanHelper.class) {
                if (instance == null) {
                    instance = new QueryPlanHelper();
                }
            }
        }
        return instance;
    }

    /**
     * Returns how many query words the service reports for {@code keyword}.
     *
     * <p>Known keywords are answered from {@link #keywordMap}; new answers are cached and queued
     * for the next {@link #writeQueryPlanToFile()}.
     *
     * @param keyword the keyword to explain
     * @return the number of query words, or {@code 0} when the keyword is blank or the service
     *         returns no usable explanation (such misses are not cached)
     */
    public int explainQueryWordCount(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return 0;
        }
        Integer cached = keywordMap.get(keyword);
        if (cached != null) {
            return cached;
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation == null || explanation.getQueryWords() == null) {
            return 0;
        }
        int wordCount = explanation.getQueryWords().size();
        keywordMap.put(keyword, wordCount);
        newWordLines.add(keyword + "," + wordCount + "," + (System.currentTimeMillis() / 1000));
        return wordCount;
    }

    /**
     * Fills {@link #keywordMap} from the cache file and records the oldest timestamp found.
     * Blank or malformed lines are skipped.
     */
    private void loadQueryPlanFromFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        List<String> lines = FileUtils.readLines(queryPlanFile);
        if (lines == null) {
            return;
        }
        for (String line : lines) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            // Split from the right: the last two fields are numeric, so a keyword that itself
            // contains a comma is still read back instead of being dropped.
            int tsSeparator = line.lastIndexOf(',');
            int countSeparator = tsSeparator > 0 ? line.lastIndexOf(',', tsSeparator - 1) : -1;
            if (countSeparator < 0) {
                continue;
            }
            String keyword = line.substring(0, countSeparator);
            String strWordCount = line.substring(countSeparator + 1, tsSeparator);
            String ts = line.substring(tsSeparator + 1);
            if (StringUtils.isBlank(keyword) || !StringUtils.isNumber(strWordCount) || !StringUtils.isNumber(ts)) {
                continue;
            }
            try {
                // Parse both numbers before touching any state so a bad line changes nothing.
                long timeStamp = Long.parseLong(ts);
                int wordCount = Integer.parseInt(strWordCount);
                if (timeStamp < minTimeStamp) {
                    minTimeStamp = timeStamp;
                }
                keywordMap.put(keyword, wordCount);
            } catch (NumberFormatException e) {
                LOG.info("string to integer exception,", e);
            }
        }
    }

    /**
     * Persists cached word counts to the cache file.
     *
     * <p>When the oldest entry loaded from the file is more than seven days old, every entry of
     * {@link #keywordMap} is written with the current timestamp; otherwise only the entries added
     * since the previous call are written.
     */
    public void writeQueryPlanToFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        long nowSecond = System.currentTimeMillis() / 1000;
        if (nowSecond - minTimeStamp > FULL_REFRESH_AGE_SECONDS && keywordMap.size() > 0) {
            List<String> allLines = new ArrayList<>(keywordMap.size());
            for (Map.Entry<String, Integer> entry : keywordMap.entrySet()) {
                allLines.add(entry.getKey() + "," + entry.getValue() + "," + nowSecond);
            }
            newWordLines = allLines;
        }
        if (newWordLines != null && newWordLines.size() > 0) {
            // NOTE(review): the third argument is presumably an "append" flag. If so, the full
            // refresh above appends to the stale file instead of replacing it, and the old
            // timestamps keep triggering a full refresh on every run - confirm against FileUtils.
            FileUtils.saveToFile(newWordLines, queryPlanFile, true);
            newWordLines = new ArrayList<>();
        }
    }

    /**
     * Returns the first explanation the service gives for {@code keyword}, using the in-memory
     * cache when possible.
     *
     * @param keyword the keyword to explain
     * @return the explanation, or {@code null} when the keyword is blank or the service returns none
     */
    public Explanation explain(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return null;
        }
        if (sqpCache.containsKey(keyword)) {
            return sqpCache.get(keyword);
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation != null) {
            cacheKeyword(keyword, explanation);
        }
        return explanation;
    }

    /**
     * Calls the remote service with an empty bucket, empty city code, zero date and spell check
     * off, and returns the first item of the response.
     *
     * @return the first explanation, or {@code null} when the response, its item list or its
     *         first item is missing
     */
    private Explanation requestFirstExplanation(String keyword) {
        String traceId = UUID.randomUUID().toString();
        Buckets bucket = new Buckets(new HashMap<String, String>());
        String cityCode = "";
        long currDate = 0L;
        int needSpell = 0;
        Explanations explanations = client.explain(traceId, bucket, cityCode, currDate, needSpell, keyword, null);
        if (explanations == null || explanations.getItems() == null || explanations.getItems().size() == 0) {
            return null;
        }
        return explanations.getItems().get(0);
    }

    /** Stores an explanation and evicts the oldest entry once the cache exceeds its size limit. */
    private void cacheKeyword(String keyword, Explanation explanation) {
        sqpCache.put(keyword, explanation);
        wordList.add(keyword);
        if (sqpCache.size() > MAX_EXPLANATION_CACHE_SIZE) {
            String removeWord = wordList.remove(0);
            sqpCache.remove(removeWord);
        }
    }

    /** Manual demo of the oldest-first eviction used by the explanation cache. */
    public static void main(String[] arg) {
        Map<String, String> cache = new HashMap<>();
        List<String> list = new ArrayList<>();
        for (int i = 0; i < 7; i++) {
            cache.put("key_" + i, String.valueOf(i));
            list.add("key_" + i);
            if (cache.size() > 5) {
                String rk = list.remove(0);
                cache.remove(rk);
            }
            System.out.println("list size:" + list.size() + ";map size:" + cache.size());
        }
    }
}
package com.secoo.so.suggest.helper;
import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.util.FileUtils;
import com.secoo.so.suggest.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Loads the word dictionaries used when merging keywords into suggest tags.
 *
 * @author wangmingfan
 * @date 2022/8/8
 */
public class WordHelper {
    private static final Logger LOG = LoggerFactory.getLogger(WordHelper.class);
    /** Configuration key holding the path of the special-word file. */
    private static final String specialWordFile = "specialWordPath";
    /** Configuration key holding the path of the synonym-tag file. */
    private static final String synonymTagFile = "synonymTagPath";

    /**
     * Loads the words that must never be split, one word per line of the configured file.
     *
     * @return the lines of the file; empty when no path is configured or nothing could be read
     * @date 2022/8/8
     */
    public static Set<String> loadSpecialWords() {
        String specialWordPath = ConfigUtil.getString(specialWordFile);
        LOG.info("debugLog specialWordFile path is " + specialWordPath);
        Set<String> words = new HashSet<>();
        if (StringUtils.isNotBlank(specialWordPath)) {
            List<String> lines = FileUtils.readLines(specialWordPath);
            if (lines != null && lines.size() > 0) {
                words.addAll(lines);
            }
        }
        return words;
    }

    /**
     * Loads synonym groups from the configured file. Each non-blank line is one group whose
     * members are separated by a vertical bar, for example:
     * <pre>
     * 男款|男式|男士
     * 女款|女式|女士
     * </pre>
     *
     * @return one set per group; empty when no path is configured or nothing could be read
     * @date 2022/8/8
     */
    public static Set<Set<String>> loadSynonymTags() {
        String synonymTagPath = ConfigUtil.getString(synonymTagFile);
        LOG.info("debugLog synonymTagFile path is " + synonymTagPath);
        Set<Set<String>> synonyms = new HashSet<>();
        if (StringUtils.isBlank(synonymTagPath)) {
            return synonyms;
        }
        List<String> lines = FileUtils.readLines(synonymTagPath);
        // The other callers of readLines guard against a null result; do the same here.
        if (lines == null) {
            return synonyms;
        }
        for (String line : lines) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            String[] arrWords = line.split("\\|");
            // A line made only of separators splits into an empty array and yields no group.
            if (arrWords.length > 0) {
                synonyms.add(new HashSet<>(Arrays.asList(arrWords)));
            }
        }
        return synonyms;
    }
}
package com.secoo.so.suggest.task; package com.secoo.so.suggest.task;
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.secoo.so.suggest.config.ConfigUtil; import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.db.DwDataSource; import com.secoo.so.suggest.db.DwDataSource;
import com.secoo.so.suggest.db.ErpDataSource; import com.secoo.so.suggest.db.ErpDataSource;
...@@ -10,6 +11,8 @@ import com.secoo.so.suggest.entity.EsSuggestKeywordInfo; ...@@ -10,6 +11,8 @@ import com.secoo.so.suggest.entity.EsSuggestKeywordInfo;
import com.secoo.so.suggest.entity.SearchKeywordInfo; import com.secoo.so.suggest.entity.SearchKeywordInfo;
import com.secoo.so.suggest.es.EsClient; import com.secoo.so.suggest.es.EsClient;
import com.secoo.so.suggest.es.EsObject; import com.secoo.so.suggest.es.EsObject;
import com.secoo.so.suggest.helper.QueryPlanHelper;
import com.secoo.so.suggest.helper.WordHelper;
import com.secoo.so.suggest.util.*; import com.secoo.so.suggest.util.*;
import lombok.Data; import lombok.Data;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
...@@ -36,6 +39,10 @@ public class SuggestTask { ...@@ -36,6 +39,10 @@ public class SuggestTask {
private static int maxTagSize = 5; private static int maxTagSize = 5;
private static long startTime = System.currentTimeMillis(); private static long startTime = System.currentTimeMillis();
private static Set<String> spWordSet = new HashSet<>();
private static Set<Set<String>> synonymList = new HashSet<>();
private static Set<String> ignoreWordSet = new HashSet<>(Arrays.asList("系列","型号","款式","风格"));
public static void main(String[] args) { public static void main(String[] args) {
startTime = System.currentTimeMillis(); startTime = System.currentTimeMillis();
log.info(">>>>>>>>>>>> start run SuggestTask , startTime: " + startTime); log.info(">>>>>>>>>>>> start run SuggestTask , startTime: " + startTime);
...@@ -55,10 +62,19 @@ public class SuggestTask { ...@@ -55,10 +62,19 @@ public class SuggestTask {
sensitiveMap = loadSensitiveMap(); sensitiveMap = loadSensitiveMap();
europeWordMap = loadEuropeWordMap(); europeWordMap = loadEuropeWordMap();
// 加载部分确定不能分割的特殊词
spWordSet = loadSpecialWords();
// 加载标签同义词
synonymList = loadTagSynonym();
QueryPlanHelper sqp = QueryPlanHelper.getInstance();
// 加载搜索词并处理 // 加载搜索词并处理
processSuggestTask(startTime); processSuggestTask(startTime);
log.info("<<<<<<<<<<<< end run SuggestTask, startTime: {} , cost: {}ms", startTime, (System.currentTimeMillis() - startTime) ); log.info("<<<<<<<<<<<< end run SuggestTask, startTime: {} , cost: {}ms", startTime, (System.currentTimeMillis() - startTime) );
System.exit(0);
} }
private static Map<String, Long> loadBrandMap() { private static Map<String, Long> loadBrandMap() {
...@@ -179,6 +195,33 @@ public class SuggestTask { ...@@ -179,6 +195,33 @@ public class SuggestTask {
return prefixFilterList; return prefixFilterList;
} }
/**
 * Builds the synonym groups used to keep equivalent tags from appearing together: two built-in
 * gender groups plus every group loaded by {@code WordHelper.loadSynonymTags()}.
 *
 * @return the combined synonym groups
 */
private static Set<Set<String>> loadTagSynonym() {
    Set<Set<String>> groups = new HashSet<>();
    groups.add(new HashSet<>(Arrays.asList("男性","男式","男士","男款","男")));
    groups.add(new HashSet<>(Arrays.asList("女性","女式","女士","女款","女")));
    // Adding an empty set is a no-op, so no size check is needed before merging.
    groups.addAll(WordHelper.loadSynonymTags());
    log.info("debugLog synonym count:" + groups.size());
    return groups;
}
/**
 * Builds the set of words that must not be split into keyword + tag: a built-in list plus every
 * word loaded by {@code WordHelper.loadSpecialWords()}.
 *
 * @return the combined special-word set
 */
private static Set<String> loadSpecialWords() {
    String[] builtIn = {
            "靴子","鞋子","裤子","袜子","裙子","帽子","杯子","箱子","包包","包袋","包带","表带",
            "大号","中号","小号","衣服","t恤","衣服","男款","男士","男式","男性","男童","女款",
            "女士","女式","女性","女童","大象","男包","女包","男鞋","女鞋"};
    Set<String> words = new HashSet<>(Arrays.asList(builtIn));
    // Adding an empty set is a no-op, so no size check is needed before merging.
    words.addAll(WordHelper.loadSpecialWords());
    log.info("debugLog specialWords count:" + words.size());
    return words;
}
private static String cleanKeyword(String keyword) { private static String cleanKeyword(String keyword) {
if (keyword != null) { if (keyword != null) {
String fixKeyword = PinYinUtils.convertToSimplifiedChinese(keyword); String fixKeyword = PinYinUtils.convertToSimplifiedChinese(keyword);
...@@ -244,6 +287,8 @@ public class SuggestTask { ...@@ -244,6 +287,8 @@ public class SuggestTask {
} }
} }
ConcurrentHashMap<String, EsSuggestKeywordInfo> tmpSuggestKeywordMap = new ConcurrentHashMap<>();
// 过滤词 // 过滤词
List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>(); List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>();
int processCount = 0; int processCount = 0;
...@@ -261,6 +306,7 @@ public class SuggestTask { ...@@ -261,6 +306,7 @@ public class SuggestTask {
cleanBeforeSaveToEs(suggestKeywordInfo); cleanBeforeSaveToEs(suggestKeywordInfo);
suggestKeywordInfoList.add(suggestKeywordInfo); suggestKeywordInfoList.add(suggestKeywordInfo);
tmpSuggestKeywordMap.put(suggestKeywordInfo.getKeyword(), suggestKeywordInfo);
} }
processCount++; processCount++;
...@@ -268,6 +314,8 @@ public class SuggestTask { ...@@ -268,6 +314,8 @@ public class SuggestTask {
log.info("keyword filter process: {} / {}", processCount, totalCount); log.info("keyword filter process: {} / {}", processCount, totalCount);
} }
} }
mergeKeywordTag(tmpSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
QueryPlanHelper.getInstance().writeQueryPlanToFile();
if ("true".equalsIgnoreCase(System.getProperty("suggest.saveToFile"))) { if ("true".equalsIgnoreCase(System.getProperty("suggest.saveToFile"))) {
// save to file // save to file
...@@ -278,6 +326,162 @@ public class SuggestTask { ...@@ -278,6 +326,162 @@ public class SuggestTask {
} }
} }
/**
 * Turns extensions of a keyword into suggest tags of that keyword.
 *
 * <p>For every keyword that has no tags yet and whose byte length is in (3, 18], the keywords
 * that start with it are collected, ranked by {@code getWordABRank()} (highest first, missing
 * scores last), and the suffixes of the best ones are stored as a comma-separated tag list in
 * rank order. At most three tags are kept, and a suffix is skipped when a synonym of it has
 * already been chosen.
 *
 * @param esSuggestKeywordMap suggest entries keyed by keyword; matching entries get their
 *                            suggest tags set in place
 */
private static void mergeKeywordTag(ConcurrentHashMap<String, EsSuggestKeywordInfo> esSuggestKeywordMap) {
    // Upper bound on how many extension words are collected for one base word.
    final int maxCount = 10000;
    // Number of tags attached to one base word.
    final int maxTags = 3;
    List<String> keyList = new ArrayList<>(esSuggestKeywordMap.keySet());
    // Lexicographic order places every extension of a word directly after the word itself.
    Collections.sort(keyList);
    for (int i = 0; i < keyList.size(); i++) {
        String word = keyList.get(i);
        EsSuggestKeywordInfo suggestInfo = esSuggestKeywordMap.get(word);
        if (suggestInfo == null) {
            continue;
        }
        int wordLen = StringUtils.getByteLength(word);
        if (wordLen <= 3 || wordLen > 18 || !StringUtils.isBlank(suggestInfo.getSuggestTags())) {
            continue;
        }

        List<EsSuggestKeywordInfo> suggestList = new ArrayList<>();
        for (int j = i + 1; j < keyList.size() && suggestList.size() < maxCount; j++) {
            String fullWord = keyList.get(j);
            if (!fullWord.startsWith(word)) {
                // Past the contiguous run of extensions; nothing further can match.
                break;
            }
            EsSuggestKeywordInfo tmpSuggest = esSuggestKeywordMap.get(fullWord);
            if (tmpSuggest == null || StringUtils.isNotBlank(tmpSuggest.getSuggestTags())) {
                continue;
            }
            int rightLen = StringUtils.getByteLength(fullWord) - wordLen;
            if (rightLen > 3 && rightLen <= 14 && !isSkipMergeTag(word, fullWord)) {
                suggestList.add(tmpSuggest);
            }
        }
        if (suggestList.isEmpty()) {
            continue;
        }

        // Highest score first; entries without a score go last.
        Collections.sort(suggestList, (t1, t2) -> {
            Double score1 = t1.getWordABRank();
            Double score2 = t2.getWordABRank();
            if (score1 != null && score2 != null) {
                return score2.compareTo(score1);
            } else if (score1 == null && score2 == null) {
                return 0;
            } else if (score1 == null) {
                return 1;
            } else {
                return -1;
            }
        });

        // Insertion-ordered so the joined tag string keeps the ranking; sizing the limit on the
        // set itself means a duplicate suffix does not use up one of the tag slots.
        Set<String> tagSet = new java.util.LinkedHashSet<>();
        int prefixChars = word.length();
        for (EsSuggestKeywordInfo info : suggestList) {
            if (tagSet.size() >= maxTags) {
                break;
            }
            String subWord = info.getKeyword().substring(prefixChars).trim();
            if (!isSkipSynonymTag(subWord, tagSet)) {
                tagSet.add(subWord);
            }
        }
        if (!tagSet.isEmpty()) {
            suggestInfo.setSuggestTags(String.join(",", tagSet));
        }
    }
}
/**
 * Decides whether a candidate tag must be dropped because it is blank or because a synonym of it
 * (including the word itself) is already among the chosen tags.
 *
 * @param subWord candidate tag
 * @param tagSet  tags chosen so far
 * @return {@code true} when the candidate must be skipped
 */
private static boolean isSkipSynonymTag(String subWord, Set<String> tagSet) {
    if (StringUtils.isBlank(subWord)) {
        return true;
    }
    for (Set<String> group : synonymList) {
        if (!group.contains(subWord)) {
            continue;
        }
        // Groups are assumed not to overlap, so only the first group containing the word is
        // consulted: skip when any of its members has already been chosen.
        return group.stream().anyMatch(tagSet::contains);
    }
    return false;
}
/**
 * Decides whether {@code fullWord} must NOT be turned into "{@code word} + tag".
 *
 * <p>Cheap lexical checks run first; only when all of them pass is the query-plan service asked
 * whether splitting the full word yields more query words than the full word has on its own.
 *
 * @param word     the base keyword (expected to be a prefix of {@code fullWord})
 * @param fullWord the longer keyword whose remainder would become the tag
 * @return {@code true} when the pair must be skipped
 */
private static boolean isSkipMergeTag(String word, String fullWord) {
    if (StringUtils.isBlank(word) || StringUtils.isBlank(fullWord)) {
        return true;
    }
    final int prefixChars = word.length();
    if (fullWord.length() <= prefixChars) {
        return true;
    }
    final boolean prefixIsEnglish = StringUtils.isEnAndMidSpaceStr(word);
    final int prefixBytes = StringUtils.getByteLength(word);
    if (prefixBytes <= 3 || prefixBytes > 18) {
        return true;
    }

    final String lastOfPrefix = word.substring(prefixChars - 1);
    final String firstOfRest = fullWord.substring(prefixChars, prefixChars + 1);
    final String tag = fullWord.substring(prefixChars).trim();

    // Keep the tag within the allowed byte length.
    final int tagBytes = StringUtils.getByteLength(tag);
    if (tagBytes <= 3 || tagBytes > 14) {
        return true;
    }
    if (ignoreWordSet.contains(tag)) {
        return true;
    }

    // An English prefix followed by an English letter is treated as one English word,
    // except when the remainder starts with "t恤" (T-shirt).
    final boolean startsWithTShirt = tag.length() >= 2 && tag.toLowerCase().startsWith("t恤");
    if (prefixIsEnglish && StringUtils.isEnStr(firstOfRest) && !startsWithTShirt) {
        return true;
    }

    // The two characters at the split point: digits on both sides, a remainder starting with
    // "色" (usually preceded by a color word), or a known common word all forbid the split.
    final String joint = lastOfPrefix + firstOfRest;
    if (StringUtils.isNumber(joint) || tag.startsWith("色") || spWordSet.contains(joint)) {
        return true;
    }

    if (!StringUtils.isNotBlank(tag)) {
        return true;
    }
    log.info("check word:" + word + " and " + fullWord);
    QueryPlanHelper planHelper = QueryPlanHelper.getInstance();
    int prefixWordCount = planHelper.explainQueryWordCount(word);
    int tagWordCount = planHelper.explainQueryWordCount(tag);
    int fullWordCount = planHelper.explainQueryWordCount(fullWord);
    // Skip when the parts together produce more query words than the full keyword does.
    return prefixWordCount + tagWordCount > fullWordCount;
}
private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) { private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) {
EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo(); EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo();
esSuggestKeywordInfo.setKeyword(keyword); esSuggestKeywordInfo.setKeyword(keyword);
......
...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject; ...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.nio.channels.Pipe;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
...@@ -2067,6 +2068,17 @@ public abstract class StringUtils { ...@@ -2067,6 +2068,17 @@ public abstract class StringUtils {
} }
/** /**
* 判断是否是英文字符串,两边有空格认为不是英文,包含在中间的空格认为是英文
*/
public static boolean isEnAndMidSpaceStr(String word) {
    // Returns true when the word consists only of ASCII letters and whitespace and neither
    // starts nor ends with a space. Null is reported as "not English" instead of throwing.
    if (word == null || word.startsWith(" ") || word.endsWith(" ")) {
        return false;
    }
    String trimmed = word.trim();
    if (trimmed.isEmpty()) {
        return false;
    }
    // Plain character scan instead of String.matches, which recompiles its pattern on every call.
    for (int i = 0; i < trimmed.length(); i++) {
        char c = trimmed.charAt(i);
        boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        // Same characters as the regex class "\s": space, tab, line feed, vertical tab (0x0B),
        // form feed and carriage return.
        boolean whitespace = c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
        if (!letter && !whitespace) {
            return false;
        }
    }
    return true;
}
/**
* 判断是否包含中文 * 判断是否包含中文
*/ */
public static boolean isContainChStr(String word) { public static boolean isContainChStr(String word) {
...@@ -2109,6 +2121,20 @@ public abstract class StringUtils { ...@@ -2109,6 +2121,20 @@ public abstract class StringUtils {
return 0; return 0;
} }
/**
 * Manual smoke check for {@link #isEnAndMidSpaceStr(String)}: prints the result for a prefix
 * with a trailing space and for the remainder of a longer word, followed by the first character
 * and the whole of that remainder.
 */
public static void main(String[] arg) {
    String word = "ab c ";
    String word1 = "ab c 中文";
    // substring works on character offsets, so measure the prefix in characters, not bytes.
    int prefixChars = word.length();
    String aaa = word1.substring(prefixChars, prefixChars + 1);
    String bbb = word1.substring(prefixChars);
    System.out.println(isEnAndMidSpaceStr(word));
    System.out.println(isEnAndMidSpaceStr(bbb));
    System.out.println(aaa);
    System.out.println(bbb);
}
/** /**
* 32位md5加密 * 32位md5加密
*/ */
...@@ -2153,7 +2179,7 @@ public abstract class StringUtils { ...@@ -2153,7 +2179,7 @@ public abstract class StringUtils {
* *
* <p>If the stripChars String is {@code null}, whitespace is * <p>If the stripChars String is {@code null}, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}. * stripped as defined by {@link Character#isWhitespace(char)}.
* Alternatively use {@link #strip(String)}.</p> * Alternatively use {@link #strip(String, String)}.</p>
* *
* <pre> * <pre>
* StringUtils.strip(null, *) = null * StringUtils.strip(null, *) = null
......
...@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D ...@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D
suggestTask.es.index=search_suggest_index suggestTask.es.index=search_suggest_index
suggestTask.es.type=search_suggest_type suggestTask.es.type=search_suggest_type
suggestTask.es.batchSize=2000 suggestTask.es.batchSize=2000
queryPlan.cachePath=/data/crontab/suggest/tmp/queryplan.txt
specialWordPath=/data/crontab/suggest/dict/specialWord.txt
synonymTagPath=/data/crontab/suggest/dict/synonymTag.txt
...@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive ...@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.batchSize=10000 suggestTask.batchSize=10000
suggestTask.threadPoolSize=10 suggestTask.threadPoolSize=10
suggestTask.suggestTagMaxSize=5
suggestTask.searchWordWarningCount=1000000 suggestTask.searchWordWarningCount=1000000
suggestTask.es.url=http://10.0.254.139:9200 suggestTask.suggestTagMaxSize=5
suggestTask.es.user=suggest suggestTask.warningPhones=13426233960
suggestTask.es.password=suggest456 suggestTask.es.url=http://bigdataescluster.secoolocal.com:9200
suggestTask.es.index=search_suggest_index suggestTask.es.user=search
suggestTask.es.password=search5z0NvEn1D
suggestTask.es.index=search_suggest_index_huidu
suggestTask.es.type=search_suggest_type suggestTask.es.type=search_suggest_type
suggestTask.es.batchSize=2000 suggestTask.es.batchSize=2000
queryPlan.cachePath=/data/crontab/test/tmp/queryplan.txt
specialWordPath=/data/crontab/test/dict/specialWord.txt
synonymTagPath=/data/crontab/test/dict/synonymTag.txt
\ No newline at end of file
erp.read.url=jdbc:mysql://10.4.3.223:3306/secooErpDB?useUnicode=true&amp;characterEncoding=utf8&amp;noAccessToProcedureBodies=true&amp;zeroDateTimeBehavior=convertToNull&amp;allowMultiQueries=true erp.read.url=jdbc:mysql://192.168.50.40:3306/secooErpDB?useUnicode=true&amp;characterEncoding=utf8&amp;noAccessToProcedureBodies=true&amp;zeroDateTimeBehavior=convertToNull&amp;allowMultiQueries=true
erp.read.user=3306_test erp.read.user=so_Erp_R
erp.read.password=iS6CXpYqgZ8Mhjui erp.read.password=5RgzudyyFlApTmve
seo.read.url=jdbc:mysql://10.4.3.223:3306/secooSeoDB?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull seo.read.url=jdbc:mysql://secooSeoDB.master.com:3307/secooSeoDB?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull
seo.read.user=SeoDB_test seo.read.user=sem_Seo_W
seo.read.password=Cxkfq57huej0fTpK seo.read.password=C2IiHfNKYpT1onsR
\ No newline at end of file
dw.read.url=jdbc:mysql://secooDataWarehouse.slave.com:3306/secooDataWarehouse?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull
dw.read.user=Search_DataWar_R
dw.read.password=pY1P9zUj9x1M65ot5szo
\ No newline at end of file
手提
手提
提包
\ No newline at end of file
皮夹|钱包
皮夹|钱包
围脖|围巾
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment