Commit b675b357 by 王明范

pass word with tag

parent e8646980
...@@ -37,6 +37,9 @@ public class SuggestTask { ...@@ -37,6 +37,9 @@ public class SuggestTask {
private static int maxTagSize = 5; private static int maxTagSize = 5;
private static long startTime = System.currentTimeMillis(); private static long startTime = System.currentTimeMillis();
private static Set<String> spWordSet = new HashSet<>(Arrays.asList("靴子","鞋子","裤子","袜子","裙子","帽子","杯子","箱子","包包","包袋","包带","表带"));
private static List<Set<String>> synonymList = new ArrayList<>();
public static void main(String[] args) { public static void main(String[] args) {
startTime = System.currentTimeMillis(); startTime = System.currentTimeMillis();
log.info(">>>>>>>>>>>> start run SuggestTask , startTime: " + startTime); log.info(">>>>>>>>>>>> start run SuggestTask , startTime: " + startTime);
...@@ -56,6 +59,9 @@ public class SuggestTask { ...@@ -56,6 +59,9 @@ public class SuggestTask {
sensitiveMap = loadSensitiveMap(); sensitiveMap = loadSensitiveMap();
europeWordMap = loadEuropeWordMap(); europeWordMap = loadEuropeWordMap();
// 加载表填同义词
loadTagSynonym();
// 加载搜索词并处理 // 加载搜索词并处理
processSuggestTask(startTime); processSuggestTask(startTime);
...@@ -180,6 +186,16 @@ public class SuggestTask { ...@@ -180,6 +186,16 @@ public class SuggestTask {
return prefixFilterList; return prefixFilterList;
} }
/**
 * Registers the built-in synonym groups used when de-duplicating suggest tags.
 *
 * <p>Two groups are appended to {@code synonymList}: one with the male
 * variants and one with the female variants. Each call appends both groups
 * again; the list is created first if it is currently {@code null}.
 */
private static void loadTagSynonym() {
    if (synonymList == null) {
        synonymList = new ArrayList<>();
    }
    // Male variants, then female variants; the insertion order of the groups is kept.
    synonymList.add(new HashSet<>(Arrays.asList("男性", "男式", "男士", "男款", "男")));
    synonymList.add(new HashSet<>(Arrays.asList("女性", "女式", "女士", "女款", "女")));
}
private static String cleanKeyword(String keyword) { private static String cleanKeyword(String keyword) {
if (keyword != null) { if (keyword != null) {
String fixKeyword = PinYinUtils.convertToSimplifiedChinese(keyword); String fixKeyword = PinYinUtils.convertToSimplifiedChinese(keyword);
...@@ -244,7 +260,9 @@ public class SuggestTask { ...@@ -244,7 +260,9 @@ public class SuggestTask {
esSuggestKeywordMap.put(fillKeyword, buildDefaultEsSuggestKeywordInfo(fillKeyword)); esSuggestKeywordMap.put(fillKeyword, buildDefaultEsSuggestKeywordInfo(fillKeyword));
} }
} }
mergeKeywordTag(esSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
ConcurrentHashMap<String, EsSuggestKeywordInfo> tmpSuggestKeywordMap = new ConcurrentHashMap<>();
// 过滤词 // 过滤词
List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>(); List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>();
int processCount = 0; int processCount = 0;
...@@ -262,6 +280,7 @@ public class SuggestTask { ...@@ -262,6 +280,7 @@ public class SuggestTask {
cleanBeforeSaveToEs(suggestKeywordInfo); cleanBeforeSaveToEs(suggestKeywordInfo);
suggestKeywordInfoList.add(suggestKeywordInfo); suggestKeywordInfoList.add(suggestKeywordInfo);
tmpSuggestKeywordMap.put(suggestKeywordInfo.getKeyword(), suggestKeywordInfo);
} }
processCount++; processCount++;
...@@ -269,7 +288,7 @@ public class SuggestTask { ...@@ -269,7 +288,7 @@ public class SuggestTask {
log.info("keyword filter process: {} / {}", processCount, totalCount); log.info("keyword filter process: {} / {}", processCount, totalCount);
} }
} }
mergeKeywordTag(tmpSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
if ("true".equalsIgnoreCase(System.getProperty("suggest.saveToFile"))) { if ("true".equalsIgnoreCase(System.getProperty("suggest.saveToFile"))) {
// save to file // save to file
saveSuggestKeywordToFile(suggestKeywordInfoList); saveSuggestKeywordToFile(suggestKeywordInfoList);
...@@ -292,7 +311,6 @@ public class SuggestTask { ...@@ -292,7 +311,6 @@ public class SuggestTask {
int wordLen = StringUtils.getByteLength(word); int wordLen = StringUtils.getByteLength(word);
int length = word.length(); int length = word.length();
if (wordLen > 3 && wordLen <= 18 && StringUtils.isBlank(suggestInfo.getSuggestTags())) { if (wordLen > 3 && wordLen <= 18 && StringUtils.isBlank(suggestInfo.getSuggestTags())) {
boolean isEN = StringUtils.isEnAndMidSpaceStr(word);
List<EsSuggestKeywordInfo> suggestList = new ArrayList<>(); List<EsSuggestKeywordInfo> suggestList = new ArrayList<>();
int keyCount = 0; int keyCount = 0;
// 获取包含word的suggest,满足条件的存入suggestList列表,限定最多100个词 // 获取包含word的suggest,满足条件的存入suggestList列表,限定最多100个词
...@@ -300,29 +318,18 @@ public class SuggestTask { ...@@ -300,29 +318,18 @@ public class SuggestTask {
if (keyCount > maxCount) { if (keyCount > maxCount) {
break; break;
} }
String fulWord = keyList.get(j); String fullWord = keyList.get(j);
EsSuggestKeywordInfo tmpSuggest = esSuggestKeywordMap.get(fulWord); EsSuggestKeywordInfo tmpSuggest = esSuggestKeywordMap.get(fullWord);
if (StringUtils.isNotBlank(tmpSuggest.getSuggestTags())) { if (StringUtils.isNotBlank(tmpSuggest.getSuggestTags())) {
continue; continue;
} }
int rightLen = StringUtils.getByteLength(fulWord) - wordLen; int rightLen = StringUtils.getByteLength(fullWord) - wordLen;
if (fulWord.startsWith(word)) { if (fullWord.startsWith(word)) {
if (rightLen > 3 && rightLen <= 14) { if (rightLen > 3 && rightLen <= 14) {
String subWord = fulWord.substring(length, length + 1); if (isSkipMergeTag(word, fullWord)) {
boolean isTShirt = false;
String rightWord = fulWord.substring(length, fulWord.length()).trim();
int realLen = StringUtils.getByteLength(rightWord);
if (rightWord.length() >= 2 && rightWord.toLowerCase().startsWith("t恤")) {
isTShirt = true;
}
if (isEN && (StringUtils.isEnStr(subWord) && !isTShirt)) {
continue;
}
if (realLen <= 3 || realLen > 14) {
continue; continue;
} }
suggestList.add(esSuggestKeywordMap.get(fulWord)); suggestList.add(esSuggestKeywordMap.get(fullWord));
keyCount++; keyCount++;
} }
} else { } else {
...@@ -344,27 +351,91 @@ public class SuggestTask { ...@@ -344,27 +351,91 @@ public class SuggestTask {
return -1; return -1;
} }
}); });
StringBuffer sb = new StringBuffer("");
for (int k=0, count=0; k < suggestList.size() && count < 3; k++, count++) { Set<String> tagSet = new HashSet<>();
int count=0;
for (int k=0; k < suggestList.size(); k++) {
EsSuggestKeywordInfo info = suggestList.get(k); EsSuggestKeywordInfo info = suggestList.get(k);
String fulWord = info.getKeyword(); String fullWord = info.getKeyword();
int fulLen = fulWord.length(); int fulLen = fullWord.length();
String subWord = fulWord.substring(length, fulLen).trim(); String subWord = fullWord.substring(length, fulLen).trim();
if (k > 0) { if(count < 3 && !isSkipSynonymTag(subWord, tagSet)) {
sb.append(","); tagSet.add(subWord);
count++;
} }
sb.append(subWord);
} }
if (sb.length() > 0) { if (tagSet.size() > 0) {
String tags = String.join(",", tagSet);
if ("lv女包".equals(word)) { if ("lv女包".equals(word)) {
log.info("debugLog keyword tag:" + sb.toString()); log.info("debugLog keyword tag:" + tags);
} }
suggestInfo.setSuggestTags(sb.toString()); suggestInfo.setSuggestTags(tags);
} }
} }
} }
} }
} }
/**
 * Decides whether a candidate tag should be left out of the tag set being built.
 *
 * <p>A candidate is skipped when it is blank, when it is already present in
 * {@code tagSet}, or when {@code tagSet} already holds any member of the
 * synonym group the candidate belongs to. Reporting an exact duplicate as
 * skippable lets callers that count accepted tags avoid counting an add
 * that would not change the set.
 *
 * @param subWord candidate tag text
 * @param tagSet  tags accepted so far; {@code null} or empty means nothing can conflict
 * @return {@code true} if the candidate must not be added, {@code false} otherwise
 */
private static boolean isSkipSynonymTag(String subWord, Set<String> tagSet) {
    if (StringUtils.isBlank(subWord)) {
        return true;
    }
    if (tagSet == null || tagSet.isEmpty()) {
        // Nothing accepted yet, so neither a duplicate nor a synonym can exist.
        return false;
    }
    if (tagSet.contains(subWord)) {
        // Exact duplicate: adding it again would be a no-op.
        return true;
    }
    if (synonymList == null) {
        // Same null tolerance as loadTagSynonym applies to this field.
        return false;
    }
    for (Set<String> synonymSet : synonymList) {
        if (!synonymSet.contains(subWord)) {
            continue;
        }
        // Synonym groups are not expected to overlap, so only the first
        // group containing the candidate is examined.
        for (String synonym : synonymSet) {
            if (tagSet.contains(synonym)) {
                return true;
            }
        }
        break;
    }
    return false;
}
/**
 * Decides whether the part of {@code fullWord} that follows {@code word}
 * must NOT be used as a tag of {@code word}.
 *
 * <p>The remainder is taken from index {@code word.length()} of
 * {@code fullWord}; this method does not itself verify that
 * {@code fullWord} starts with {@code word}.
 *
 * @param word     the shorter keyword that would receive the tag
 * @param fullWord the longer keyword whose tail is the tag candidate
 * @return {@code true} if the tail should be rejected as a tag,
 *         {@code false} if it may be used
 */
private static boolean isSkipMergeTag(String word, String fullWord) {
    if (StringUtils.isBlank(word) || StringUtils.isBlank(fullWord)) {
        return true;
    }
    final int prefixLen = word.length();
    if (fullWord.length() <= prefixLen) {
        // No tail left to act as a tag.
        return true;
    }

    final boolean englishPrefix = StringUtils.isEnAndMidSpaceStr(word);
    final int prefixBytes = StringUtils.getByteLength(word);
    if (!(prefixBytes > 3 && prefixBytes <= 18)) {
        return true;
    }

    final String remainder = fullWord.substring(prefixLen).trim();
    final int remainderBytes = StringUtils.getByteLength(remainder);
    if (!(remainderBytes > 3 && remainderBytes <= 14)) {
        // Bounds on the length of a tag.
        return true;
    }

    // First character right after the prefix, taken before trimming.
    final String firstTailChar = String.valueOf(fullWord.charAt(prefixLen));
    if (englishPrefix && StringUtils.isEnStr(firstTailChar)) {
        // English prefix directly followed by English text is treated as
        // one English word, except when the tail starts with "t恤" (T-shirt).
        final boolean startsWithTShirt =
                remainder.length() >= 2 && remainder.toLowerCase().startsWith("t恤");
        if (!startsWithTShirt) {
            return true;
        }
    }

    if (remainder.startsWith("色")) {
        // Usually preceded by a colour word; the split tail is not a tag.
        return true;
    }

    // Last char of the prefix plus first char of the tail forming a common
    // word means the split point falls inside that word, so no tag.
    final String boundaryPair = word.substring(prefixLen - 1) + firstTailChar;
    return spWordSet.contains(boundaryPair);
}
private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) { private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) {
EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo(); EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo();
......
...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject; ...@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.nio.channels.Pipe;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
...@@ -2121,6 +2122,7 @@ public abstract class StringUtils { ...@@ -2121,6 +2122,7 @@ public abstract class StringUtils {
} }
public static void main(String[] arg) { public static void main(String[] arg) {
Set<String> spWordSet = new HashSet<>(Arrays.asList("靴子","鞋子","裤子","袜子","裙子","帽子","杯子","箱子","包包","包袋","包袋"));
String word = "ab c "; String word = "ab c ";
String word1 = "ab c 中文"; String word1 = "ab c 中文";
int wordLen = StringUtils.getByteLength(word); int wordLen = StringUtils.getByteLength(word);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment