Commit 099f58c5 by 王明范

Merge branch 'TECH-SEARCH-merge-keyword-tag' into 'master'

Tech search merge keyword tag

See merge request !4
parents c31088f6 70d2074b
...@@ -36,7 +36,41 @@ ...@@ -36,7 +36,41 @@
</profiles> </profiles>
<dependencies> <dependencies>
<dependency>
<groupId>com.secoo</groupId>
<artifactId>sqp4j-client</artifactId>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<artifactId>secoo-log</artifactId>
<groupId>com.secoo.mall</groupId>
</exclusion>
</exclusions>
<version>2.9.6.RELEASE</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>dubbo</artifactId>
<version>2.6.0</version>
<exclusions>
<exclusion>
<artifactId>spring-context</artifactId>
<groupId>org.springframework</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.github.sgroschupf</groupId>
<artifactId>zkclient</artifactId>
<version>0.1</version>
</dependency>
<dependency> <dependency>
<groupId>mysql</groupId> <groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId> <artifactId>mysql-connector-java</artifactId>
......
package com.secoo.so.suggest.client;
import com.alibaba.dubbo.config.ApplicationConfig;
import com.alibaba.dubbo.config.ReferenceConfig;
import com.alibaba.dubbo.config.RegistryConfig;
import com.secoo.search.sqp4j.QueryPlan;
import org.apache.log4j.Logger;
import java.util.Map;
/**
 * Factory for Dubbo references to the SQP {@link QueryPlan} service.
 *
 * <p>Offers registry-based references for the prod, UAT (group {@code "grey"}) and test
 * environments, plus a direct connection to an explicitly given address. The registry-based
 * {@link ReferenceConfig} objects are created lazily and cached in static fields; the lazy
 * initializers are {@code synchronized} so that only a fully configured instance is ever
 * stored in a cache field.
 *
 * @author wangmingfan
 * @date 2020/8/17
 */
public class SqpDubboClient {
    private static final Logger loger = Logger.getLogger(SqpDubboClient.class);

    /** Dubbo application name reported by every reference built here. */
    private static final String APPLICATION_NAME = "sem-test-tool";
    /** Fully qualified name of the remote service interface. */
    private static final String SERVICE_INTERFACE = "com.secoo.search.sqp4j.QueryPlan";
    /** Service version requested from the provider. */
    private static final String SERVICE_VERSION = "1.0.0";
    /** ZooKeeper registry used by both the prod and the UAT reference. */
    private static final String PROD_REGISTRY_ADDRESS =
            "zookeeper://zk-mall1.secoolocal.com:5181?backup=zk-mall2.secoolocal.com:5181,zk-mall3.secoolocal.com:5181";
    /** ZooKeeper registry used by the test reference. */
    private static final String TEST_REGISTRY_ADDRESS =
            "zookeeper://10.185.240.81:2181?backup=10.185.240.82:2181,10.185.240.83:2181";
    /** Dubbo group selected by the UAT reference. */
    private static final String UAT_GROUP = "grey";

    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigProd = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigUat = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigTest = null;

    /**
     * Connects directly to a provider, bypassing the registry. A new {@link ReferenceConfig}
     * is created on every call (it is not cached).
     *
     * @param url provider address in the form {@code ip:port}
     * @param map receives the connection details under the keys {@code Client},
     *            {@code Interface}, {@code Protocol}, {@code Url} and {@code Cluster};
     *            may be {@code null}, in which case nothing is recorded
     * @return the service proxy obtained from the reference
     */
    public static QueryPlan getDirectImpl(String url, Map<String, String> map) {
        ReferenceConfig<QueryPlan> impl = directDubboSqpReferenceConfig(url);
        QueryPlan dubboSqp = impl.get();
        // Guard against a null map so the freshly created reference is not lost to an NPE.
        if (map != null) {
            map.put("Client", impl.getClient());
            map.put("Interface", impl.getInterface());
            map.put("Protocol", impl.getProtocol());
            map.put("Url", impl.getUrl());
            map.put("Cluster", impl.getCluster());
        }
        return dubboSqp;
    }

    /**
     * Returns the service proxy for the test environment (no Dubbo group is set there).
     *
     * @return the service proxy obtained from the cached test reference
     */
    public static QueryPlan getTestImpl() {
        return getTestDubboSqpReferenceConfig().get();
    }

    /**
     * Returns the service proxy for the UAT environment (group {@code "grey"}).
     *
     * @return the service proxy obtained from the cached UAT reference
     */
    public static QueryPlan getUatImpl() {
        return getDubboSqpReferenceConfigUat().get();
    }

    /**
     * Returns the service proxy for the production environment.
     *
     * @return the service proxy obtained from the cached prod reference
     */
    public static QueryPlan getProdImpl() {
        return getDubboSqpReferenceConfigProd().get();
    }

    /** Lazily creates and caches the prod reference. */
    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigProd() {
        if (dubboSqpReferenceConfigProd == null) {
            dubboSqpReferenceConfigProd = buildRegistryReferenceConfig(PROD_REGISTRY_ADDRESS, null);
        }
        return dubboSqpReferenceConfigProd;
    }

    /**
     * Lazily creates and caches the UAT reference. The group is applied before the instance
     * is stored, so the cache field never holds a reference without its group.
     */
    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigUat() {
        if (dubboSqpReferenceConfigUat == null) {
            dubboSqpReferenceConfigUat = buildRegistryReferenceConfig(PROD_REGISTRY_ADDRESS, UAT_GROUP);
        }
        return dubboSqpReferenceConfigUat;
    }

    /**
     * Lazily creates and caches the reference for the test environment.
     *
     * @return the cached test-environment reference
     * @author wangmingfan
     * @date 2020/8/17
     */
    private static synchronized ReferenceConfig<QueryPlan> getTestDubboSqpReferenceConfig() {
        if (dubboSqpReferenceConfigTest == null) {
            dubboSqpReferenceConfigTest = buildRegistryReferenceConfig(TEST_REGISTRY_ADDRESS, null);
        }
        return dubboSqpReferenceConfigTest;
    }

    /**
     * Builds a reference that discovers providers through a ZooKeeper registry.
     *
     * @param registryAddress registry URL passed to {@link RegistryConfig}
     * @param group           Dubbo group to select, or {@code null} to leave the group unset
     * @return a fully configured, not yet initialized reference
     */
    private static ReferenceConfig<QueryPlan> buildRegistryReferenceConfig(String registryAddress, String group) {
        ReferenceConfig<QueryPlan> impl = newBaseReferenceConfig();
        RegistryConfig registryConfig = new RegistryConfig(registryAddress);
        registryConfig.setProtocol("zookeeper");
        registryConfig.setClient("zkclient");
        impl.setRegistry(registryConfig);
        if (group != null) {
            impl.setGroup(group);
        }
        return impl;
    }

    /**
     * Builds a reference that connects straight to the given provider address.
     *
     * @param url provider address in the form {@code ip:port}, e.g. {@code 10.185.240.158:20062}
     * @return a fully configured, not yet initialized reference
     * @author wangmingfan
     * @date 2020/4/6
     */
    private static ReferenceConfig<QueryPlan> directDubboSqpReferenceConfig(String url) {
        ReferenceConfig<QueryPlan> impl = newBaseReferenceConfig();
        impl.setUrl(url);
        return impl;
    }

    /** Creates a reference carrying the settings shared by every connection mode. */
    private static ReferenceConfig<QueryPlan> newBaseReferenceConfig() {
        ReferenceConfig<QueryPlan> impl = new ReferenceConfig<QueryPlan>();
        impl.setProtocol("dubbo");
        impl.setApplication(new ApplicationConfig(APPLICATION_NAME));
        impl.setVersion(SERVICE_VERSION);
        impl.setInterface(SERVICE_INTERFACE);
        return impl;
    }
}
package com.secoo.so.suggest.helper;
import com.secoo.abtest.common.Buckets;
import com.secoo.search.sqp4j.Explanation;
import com.secoo.search.sqp4j.Explanations;
import com.secoo.search.sqp4j.QueryPlan;
import com.secoo.search.sqp4j.QueryWord;
import com.secoo.search.sqp4j.client.QueryPlanClient;
import com.secoo.so.suggest.client.SqpDubboClient;
import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.util.FileUtils;
import com.secoo.so.suggest.util.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.*;
/**
 * Singleton wrapper around the SQP {@link QueryPlan} service that caches results.
 *
 * <p>Two caches are kept:
 * <ul>
 *   <li>{@code keywordMap}: keyword to query-word count, loaded from and appended to the file
 *       configured under {@code queryPlan.cachePath} (one {@code keyword,count,epochSeconds}
 *       line per entry);</li>
 *   <li>{@code sqpCache}: keyword to {@link Explanation}, in memory only, bounded to
 *       {@value #MAX_SQP_CACHE_SIZE} entries with first-in-first-out eviction.</li>
 * </ul>
 *
 * <p>The caches are plain {@link HashMap}s and are accessed without synchronization.
 *
 * @author wangmingfan
 * @date 2022/8/4
 */
public class QueryPlanHelper {
    private static final Logger LOG = LoggerFactory.getLogger(QueryPlanHelper.class);

    /** Upper bound for the in-memory explanation cache. */
    private static final int MAX_SQP_CACHE_SIZE = 100000;
    /** Age (in seconds) of the oldest file entry after which the whole map is dumped again. */
    private static final long FULL_REWRITE_INTERVAL_SECONDS = 7L * 24 * 3600;

    private static volatile QueryPlanHelper instance;
    private static QueryPlan client = null;

    Map<String, Explanation> sqpCache = new HashMap<>();
    // Insertion order of sqpCache keys; a linked list makes removing the oldest key O(1).
    List<String> wordList = new LinkedList<>();
    Map<String, Integer> keywordMap = new HashMap<>();

    private static String queryPlanFile = ""; // e.g. /data/crontab/test/tmp/queryplan.txt
    private static List<String> newWordLines = new ArrayList<>();
    private static long minTimeStamp = Long.MAX_VALUE;

    /** Reads the cache path from configuration, obtains the prod client and loads the file cache. */
    private QueryPlanHelper() {
        queryPlanFile = ConfigUtil.getString("queryPlan.cachePath", "");
        LOG.info("debugLog queryPlanFile path is {}", queryPlanFile);
        client = SqpDubboClient.getProdImpl();
        loadQueryPlanFromFile();
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the singleton helper
     */
    public static QueryPlanHelper getInstance() {
        if (instance == null) {
            synchronized (QueryPlanHelper.class) {
                if (instance == null) {
                    instance = new QueryPlanHelper();
                }
            }
        }
        return instance;
    }

    /**
     * Returns the number of query words the service reports for a keyword.
     *
     * <p>Answers from {@code keywordMap} when possible; otherwise asks the service, caches the
     * count and queues a line for the next {@link #writeQueryPlanToFile()}.
     *
     * @param keyword the search keyword
     * @return the query-word count, or {@code 0} when the keyword is blank or the service
     *         returned no usable explanation
     */
    public int explainQueryWordCount(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return 0;
        }
        Integer cachedCount = keywordMap.get(keyword);
        if (cachedCount != null) {
            return cachedCount;
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation == null || explanation.getQueryWords() == null) {
            return 0;
        }
        int wordCount = explanation.getQueryWords().size();
        keywordMap.put(keyword, wordCount);
        newWordLines.add(keyword + "," + wordCount + "," + (System.currentTimeMillis() / 1000));
        return wordCount;
    }

    /**
     * Populates {@code keywordMap} from the cache file and records the smallest timestamp seen.
     *
     * <p>Each line is parsed from the right ({@code keyword,count,timestamp}) so that keywords
     * which themselves contain commas, and which {@link #explainQueryWordCount(String)} writes
     * unescaped, are read back instead of being dropped.
     */
    private void loadQueryPlanFromFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        List<String> lines = FileUtils.readLines(queryPlanFile);
        if (lines == null) {
            return;
        }
        for (String line : lines) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            int timeStampSep = line.lastIndexOf(',');
            int countSep = timeStampSep > 0 ? line.lastIndexOf(',', timeStampSep - 1) : -1;
            if (countSep <= 0) {
                // Fewer than three fields, or an empty keyword.
                continue;
            }
            String keyword = line.substring(0, countSep);
            String strWordCount = line.substring(countSep + 1, timeStampSep);
            String ts = line.substring(timeStampSep + 1);
            if (StringUtils.isBlank(keyword) || !StringUtils.isNumber(strWordCount) || !StringUtils.isNumber(ts)) {
                continue;
            }
            try {
                long timeStamp = Long.parseLong(ts);
                int wordCount = Integer.parseInt(strWordCount);
                keywordMap.put(keyword, wordCount);
                if (timeStamp < minTimeStamp) {
                    minTimeStamp = timeStamp;
                }
            } catch (NumberFormatException e) {
                LOG.warn("string to integer exception,", e);
            }
        }
    }

    /**
     * Persists cached keyword counts to the cache file.
     *
     * <p>If the oldest timestamp loaded from the file is more than seven days old, every entry
     * of {@code keywordMap} is written with the current time; otherwise only the entries added
     * since the last write are written. Does nothing when no cache path is configured.
     */
    public void writeQueryPlanToFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        long nowSecond = System.currentTimeMillis() / 1000;
        if (nowSecond - minTimeStamp > FULL_REWRITE_INTERVAL_SECONDS && !keywordMap.isEmpty()) {
            List<String> allLines = new ArrayList<>(keywordMap.size());
            for (Map.Entry<String, Integer> entry : keywordMap.entrySet()) {
                allLines.add(entry.getKey() + "," + entry.getValue() + "," + nowSecond);
            }
            newWordLines = allLines;
            // Every entry now carries the current time; without this reset each later call
            // would dump the complete map again.
            minTimeStamp = nowSecond;
        }
        if (newWordLines != null && !newWordLines.isEmpty()) {
            // NOTE(review): the third argument is presumably an "append" flag. If so, the full
            // dump above is appended to the stale lines instead of replacing them; confirm
            // against FileUtils.saveToFile and pass false for the full-dump case if needed.
            FileUtils.saveToFile(newWordLines, queryPlanFile, true);
            newWordLines = new ArrayList<>();
        }
    }

    /**
     * Returns the first explanation the service produces for a keyword, using the in-memory
     * cache when possible.
     *
     * @param keyword the search keyword
     * @return the explanation, or {@code null} when the keyword is blank or the service
     *         returned nothing usable
     */
    public Explanation explain(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return null;
        }
        if (sqpCache.containsKey(keyword)) {
            return sqpCache.get(keyword);
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation != null) {
            cacheKeyword(keyword, explanation);
        }
        return explanation;
    }

    /**
     * Calls the remote service for a keyword and returns the first item of the response.
     *
     * @param keyword the search keyword
     * @return the first explanation, or {@code null} when the response or its item list is
     *         {@code null} or empty
     */
    private Explanation requestFirstExplanation(String keyword) {
        String traceId = UUID.randomUUID().toString();
        Map<String, String> bucketInfo = new HashMap<>();
        Buckets bucket = new Buckets(bucketInfo);
        String cityCode = "";
        long currDate = 0L;
        int needSpell = 0;
        Explanations explanations = client.explain(traceId, bucket, cityCode, currDate, needSpell, keyword, null);
        if (explanations == null || explanations.getItems() == null || explanations.getItems().size() == 0) {
            return null;
        }
        return explanations.getItems().get(0);
    }

    /** Stores an explanation and evicts the oldest entry once the cache exceeds its bound. */
    private void cacheKeyword(String keyword, Explanation explanation) {
        sqpCache.put(keyword, explanation);
        wordList.add(keyword);
        if (sqpCache.size() > MAX_SQP_CACHE_SIZE) {
            sqpCache.remove(wordList.remove(0));
        }
    }

    /**
     * Manual demonstration of the first-in-first-out eviction scheme used by the cache,
     * with a bound of 5 entries; prints the list and map sizes after each insertion.
     *
     * @param arg unused
     */
    public static void main(String[] arg) {
        Map<String, String> cache = new HashMap<>();
        List<String> list = new ArrayList<>();
        for (int i = 0; i < 7; i++) {
            cache.put("key_" + i, String.valueOf(i));
            list.add("key_" + i);
            if (cache.size() > 5) {
                cache.remove(list.remove(0));
            }
            System.out.println("list size:" + list.size() + ";map size:" + cache.size());
        }
    }
}
package com.secoo.so.suggest.helper;
import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.util.FileUtils;
import com.secoo.so.suggest.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Loads word dictionaries (special words and synonym tags) from files whose paths are read
 * from configuration.
 *
 * @author wangmingfan
 * @date 2022/8/8
 */
public class WordHelper {
    private static final Logger LOG = LoggerFactory.getLogger(WordHelper.class);
    /** Configuration key holding the path of the special-word file. */
    private static final String SPECIAL_WORD_PATH_KEY = "specialWordPath";
    /** Configuration key holding the path of the synonym-tag file. */
    private static final String SYNONYM_TAG_PATH_KEY = "synonymTagPath";

    /**
     * Loads the words that must not be split, one word per line of the file.
     *
     * <p>Blank lines are skipped, matching {@link #loadSynonymTags()}.
     *
     * @return the set of special words; empty when no path is configured or the file
     *         yields no lines
     * @date 2022/8/8
     */
    public static Set<String> loadSpecialWords() {
        String specialWordPath = ConfigUtil.getString(SPECIAL_WORD_PATH_KEY);
        LOG.info("debugLog specialWordFile path is {}", specialWordPath);
        Set<String> words = new HashSet<>();
        if (StringUtils.isBlank(specialWordPath)) {
            return words;
        }
        List<String> lines = FileUtils.readLines(specialWordPath);
        if (lines == null) {
            return words;
        }
        for (String line : lines) {
            if (StringUtils.isNotBlank(line)) {
                words.add(line);
            }
        }
        return words;
    }

    /**
     * Loads groups of synonymous tags. Each line of the file is one group whose members are
     * separated by a vertical bar, for example:
     * <pre>
     * 男款|男式|男士
     * 女款|女式|女士
     * </pre>
     *
     * @return the set of synonym groups; empty when no path is configured or the file
     *         yields no lines
     * @date 2022/8/8
     */
    public static Set<Set<String>> loadSynonymTags() {
        String synonymTagPath = ConfigUtil.getString(SYNONYM_TAG_PATH_KEY);
        LOG.info("debugLog synonymTagFile path is {}", synonymTagPath);
        Set<Set<String>> synonyms = new HashSet<>();
        if (StringUtils.isBlank(synonymTagPath)) {
            return synonyms;
        }
        List<String> lines = FileUtils.readLines(synonymTagPath);
        // loadSpecialWords() treats a null result as possible, so guard here as well.
        if (lines == null) {
            return synonyms;
        }
        for (String line : lines) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            String[] arrWords = line.split("\\|");
            // A line made only of separators splits into an empty array.
            if (arrWords.length > 0) {
                synonyms.add(new HashSet<>(Arrays.asList(arrWords)));
            }
        }
        return synonyms;
    }
}
...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject; ...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.nio.channels.Pipe;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
...@@ -2067,6 +2068,17 @@ public abstract class StringUtils { ...@@ -2067,6 +2068,17 @@ public abstract class StringUtils {
} }
/** One or more characters, each an ASCII letter or a whitespace character. */
private static final java.util.regex.Pattern EN_LETTERS_AND_WHITESPACE =
        java.util.regex.Pattern.compile("[a-zA-Z\\s]+");

/**
 * Tells whether a string consists only of English letters, optionally with whitespace
 * in the middle.
 *
 * <p>A string that starts or ends with a space character is rejected. Any other leading or
 * trailing characters removed by {@link String#trim()} are stripped before the check.
 *
 * @param word the string to test; may be {@code null}
 * @return {@code true} if, after trimming, the string is non-empty and contains only ASCII
 *         letters and whitespace; {@code false} otherwise, including for {@code null}
 */
public static boolean isEnAndMidSpaceStr(String word) {
    if (word == null || word.startsWith(" ") || word.endsWith(" ")) {
        return false;
    }
    return EN_LETTERS_AND_WHITESPACE.matcher(word.trim()).matches();
}
/**
* 判断是否包含中文 * 判断是否包含中文
*/ */
public static boolean isContainChStr(String word) { public static boolean isContainChStr(String word) {
...@@ -2109,6 +2121,20 @@ public abstract class StringUtils { ...@@ -2109,6 +2121,20 @@ public abstract class StringUtils {
return 0; return 0;
} }
/**
 * Ad-hoc manual check of {@link #isEnAndMidSpaceStr(String)}: prints the result for an
 * English prefix with a trailing space and for the suffix that follows it in a mixed
 * English/Chinese string, then prints the first character of that suffix and the suffix itself.
 *
 * @param arg unused
 */
public static void main(String[] arg) {
    String word = "ab c ";
    String word1 = "ab c 中文";
    // substring() takes character offsets, so use the prefix's character length
    // rather than a byte length to locate where the suffix begins.
    int suffixStart = word.length();
    String aaa = word1.substring(suffixStart, suffixStart + 1);
    String bbb = word1.substring(suffixStart);
    System.out.println(isEnAndMidSpaceStr(word));
    System.out.println(isEnAndMidSpaceStr(bbb));
    System.out.println(aaa);
    System.out.println(bbb);
}
/** /**
* 32位md5加密 * 32位md5加密
*/ */
...@@ -2153,7 +2179,7 @@ public abstract class StringUtils { ...@@ -2153,7 +2179,7 @@ public abstract class StringUtils {
* *
* <p>If the stripChars String is {@code null}, whitespace is * <p>If the stripChars String is {@code null}, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}. * stripped as defined by {@link Character#isWhitespace(char)}.
* Alternatively use {@link #strip(String)}.</p> * Alternatively use {@link #strip(String, String)}.</p>
* *
* <pre> * <pre>
* StringUtils.strip(null, *) = null * StringUtils.strip(null, *) = null
......
...@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D ...@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D
suggestTask.es.index=search_suggest_index suggestTask.es.index=search_suggest_index
suggestTask.es.type=search_suggest_type suggestTask.es.type=search_suggest_type
suggestTask.es.batchSize=2000 suggestTask.es.batchSize=2000
queryPlan.cachePath=/data/crontab/suggest/tmp/queryplan.txt
specialWordPath=/data/crontab/suggest/dict/specialWord.txt
synonymTagPath=/data/crontab/suggest/dict/synonymTag.txt
...@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive ...@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.batchSize=10000 suggestTask.batchSize=10000
suggestTask.threadPoolSize=10 suggestTask.threadPoolSize=10
suggestTask.suggestTagMaxSize=5
suggestTask.searchWordWarningCount=1000000 suggestTask.searchWordWarningCount=1000000
suggestTask.es.url=http://10.0.254.139:9200 suggestTask.suggestTagMaxSize=5
suggestTask.es.user=suggest suggestTask.warningPhones=13426233960
suggestTask.es.password=suggest456 suggestTask.es.url=http://bigdataescluster.secoolocal.com:9200
suggestTask.es.index=search_suggest_index suggestTask.es.user=search
suggestTask.es.password=search5z0NvEn1D
suggestTask.es.index=search_suggest_index_huidu
suggestTask.es.type=search_suggest_type suggestTask.es.type=search_suggest_type
suggestTask.es.batchSize=2000 suggestTask.es.batchSize=2000
\ No newline at end of file
queryPlan.cachePath=/data/crontab/test/tmp/queryplan.txt
specialWordPath=/data/crontab/test/dict/specialWord.txt
synonymTagPath=/data/crontab/test/dict/synonymTag.txt
\ No newline at end of file
erp.read.url=jdbc:mysql://10.4.3.223:3306/secooErpDB?useUnicode=true&amp;characterEncoding=utf8&amp;noAccessToProcedureBodies=true&amp;zeroDateTimeBehavior=convertToNull&amp;allowMultiQueries=true erp.read.url=jdbc:mysql://192.168.50.40:3306/secooErpDB?useUnicode=true&amp;characterEncoding=utf8&amp;noAccessToProcedureBodies=true&amp;zeroDateTimeBehavior=convertToNull&amp;allowMultiQueries=true
erp.read.user=3306_test erp.read.user=so_Erp_R
erp.read.password=iS6CXpYqgZ8Mhjui erp.read.password=5RgzudyyFlApTmve
seo.read.url=jdbc:mysql://10.4.3.223:3306/secooSeoDB?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull seo.read.url=jdbc:mysql://secooSeoDB.master.com:3307/secooSeoDB?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull
seo.read.user=SeoDB_test seo.read.user=sem_Seo_W
seo.read.password=Cxkfq57huej0fTpK seo.read.password=C2IiHfNKYpT1onsR
\ No newline at end of file
dw.read.url=jdbc:mysql://secooDataWarehouse.slave.com:3306/secooDataWarehouse?useUnicode=true&amp;characterEncoding=utf8&amp;zeroDateTimeBehavior=convertToNull
dw.read.user=Search_DataWar_R
dw.read.password=pY1P9zUj9x1M65ot5szo
\ No newline at end of file
手提
手提
提包
\ No newline at end of file
皮夹|钱包
皮夹|钱包
围脖|围巾
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment