Commit b675b357 by 王明范

pass word with tag

parent e8646980
......@@ -37,6 +37,9 @@ public class SuggestTask {
private static int maxTagSize = 5;
private static long startTime = System.currentTimeMillis();
private static Set<String> spWordSet = new HashSet<>(Arrays.asList("靴子","鞋子","裤子","袜子","裙子","帽子","杯子","箱子","包包","包袋","包带","表带"));
private static List<Set<String>> synonymList = new ArrayList<>();
public static void main(String[] args) {
startTime = System.currentTimeMillis();
log.info(">>>>>>>>>>>> start run SuggestTask , startTime: " + startTime);
......@@ -56,6 +59,9 @@ public class SuggestTask {
sensitiveMap = loadSensitiveMap();
europeWordMap = loadEuropeWordMap();
// 加载表填同义词
loadTagSynonym();
// 加载搜索词并处理
processSuggestTask(startTime);
......@@ -180,6 +186,16 @@ public class SuggestTask {
return prefixFilterList;
}
/**
 * Registers the built-in synonym groups used when de-duplicating suggest tags.
 *
 * <p>Two groups are appended to {@code synonymList}, in this order: the
 * male-related words, then the female-related words. Each call appends both
 * groups again; nothing is cleared first.
 */
private static void loadTagSynonym() {
    // Re-create the holder list if it has been set to null.
    if (synonymList == null) {
        synonymList = new ArrayList<>();
    }
    // Group 1: male-related words.
    synonymList.add(new HashSet<>(Arrays.asList("男性", "男式", "男士", "男款", "男")));
    // Group 2: female-related words.
    synonymList.add(new HashSet<>(Arrays.asList("女性", "女式", "女士", "女款", "女")));
}
private static String cleanKeyword(String keyword) {
if (keyword != null) {
String fixKeyword = PinYinUtils.convertToSimplifiedChinese(keyword);
......@@ -244,7 +260,9 @@ public class SuggestTask {
esSuggestKeywordMap.put(fillKeyword, buildDefaultEsSuggestKeywordInfo(fillKeyword));
}
}
mergeKeywordTag(esSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
ConcurrentHashMap<String, EsSuggestKeywordInfo> tmpSuggestKeywordMap = new ConcurrentHashMap<>();
// 过滤词
List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>();
int processCount = 0;
......@@ -262,6 +280,7 @@ public class SuggestTask {
cleanBeforeSaveToEs(suggestKeywordInfo);
suggestKeywordInfoList.add(suggestKeywordInfo);
tmpSuggestKeywordMap.put(suggestKeywordInfo.getKeyword(), suggestKeywordInfo);
}
processCount++;
......@@ -269,7 +288,7 @@ public class SuggestTask {
log.info("keyword filter process: {} / {}", processCount, totalCount);
}
}
mergeKeywordTag(tmpSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
if ("true".equalsIgnoreCase(System.getProperty("suggest.saveToFile"))) {
// save to file
saveSuggestKeywordToFile(suggestKeywordInfoList);
......@@ -292,7 +311,6 @@ public class SuggestTask {
int wordLen = StringUtils.getByteLength(word);
int length = word.length();
if (wordLen > 3 && wordLen <= 18 && StringUtils.isBlank(suggestInfo.getSuggestTags())) {
boolean isEN = StringUtils.isEnAndMidSpaceStr(word);
List<EsSuggestKeywordInfo> suggestList = new ArrayList<>();
int keyCount = 0;
// 获取包含word的suggest,满足条件的存入suggestList列表,限定最多100个词
......@@ -300,29 +318,18 @@ public class SuggestTask {
if (keyCount > maxCount) {
break;
}
String fulWord = keyList.get(j);
EsSuggestKeywordInfo tmpSuggest = esSuggestKeywordMap.get(fulWord);
String fullWord = keyList.get(j);
EsSuggestKeywordInfo tmpSuggest = esSuggestKeywordMap.get(fullWord);
if (StringUtils.isNotBlank(tmpSuggest.getSuggestTags())) {
continue;
}
int rightLen = StringUtils.getByteLength(fulWord) - wordLen;
if (fulWord.startsWith(word)) {
int rightLen = StringUtils.getByteLength(fullWord) - wordLen;
if (fullWord.startsWith(word)) {
if (rightLen > 3 && rightLen <= 14) {
String subWord = fulWord.substring(length, length + 1);
boolean isTShirt = false;
String rightWord = fulWord.substring(length, fulWord.length()).trim();
int realLen = StringUtils.getByteLength(rightWord);
if (rightWord.length() >= 2 && rightWord.toLowerCase().startsWith("t恤")) {
isTShirt = true;
}
if (isEN && (StringUtils.isEnStr(subWord) && !isTShirt)) {
continue;
}
if (realLen <= 3 || realLen > 14) {
if (isSkipMergeTag(word, fullWord)) {
continue;
}
suggestList.add(esSuggestKeywordMap.get(fulWord));
suggestList.add(esSuggestKeywordMap.get(fullWord));
keyCount++;
}
} else {
......@@ -344,27 +351,91 @@ public class SuggestTask {
return -1;
}
});
StringBuffer sb = new StringBuffer("");
for (int k=0, count=0; k < suggestList.size() && count < 3; k++, count++) {
Set<String> tagSet = new HashSet<>();
int count=0;
for (int k=0; k < suggestList.size(); k++) {
EsSuggestKeywordInfo info = suggestList.get(k);
String fulWord = info.getKeyword();
int fulLen = fulWord.length();
String subWord = fulWord.substring(length, fulLen).trim();
if (k > 0) {
sb.append(",");
String fullWord = info.getKeyword();
int fulLen = fullWord.length();
String subWord = fullWord.substring(length, fulLen).trim();
if(count < 3 && !isSkipSynonymTag(subWord, tagSet)) {
tagSet.add(subWord);
count++;
}
sb.append(subWord);
}
if (sb.length() > 0) {
if (tagSet.size() > 0) {
String tags = String.join(",", tagSet);
if ("lv女包".equals(word)) {
log.info("debugLog keyword tag:" + sb.toString());
log.info("debugLog keyword tag:" + tags);
}
suggestInfo.setSuggestTags(sb.toString());
suggestInfo.setSuggestTags(tags);
}
}
}
}
}
/**
 * Decides whether a candidate tag should be skipped because it is blank or
 * because a word from the same synonym group is already in the tag set.
 *
 * @param subWord candidate tag text
 * @param tagSet  tags collected so far for the current keyword
 * @return {@code true} if {@code subWord} is blank (per {@code StringUtils.isBlank}),
 *         or if the first synonym group containing {@code subWord} has any member
 *         (including {@code subWord} itself) already present in {@code tagSet};
 *         {@code false} otherwise
 */
private static boolean isSkipSynonymTag(String subWord, Set<String> tagSet) {
    if (StringUtils.isBlank(subWord)) {
        return true;
    }

    // Overlapping synonym groups are not supported: only the first group that
    // contains the candidate is considered, then the search stops.
    Set<String> owningGroup = null;
    for (Set<String> group : synonymList) {
        if (group.contains(subWord)) {
            owningGroup = group;
            break;
        }
    }
    if (owningGroup == null) {
        // The candidate belongs to no synonym group, so nothing can conflict.
        return false;
    }

    // Skip when any member of that group has already been chosen as a tag.
    for (String member : owningGroup) {
        if (tagSet.contains(member)) {
            return true;
        }
    }
    return false;
}
/**
 * Decides whether the part of {@code fullWord} that follows the first
 * {@code word.length()} characters must NOT be used as a tag of {@code word}.
 *
 * <p>This method does not itself verify that {@code fullWord} starts with
 * {@code word}; it only slices {@code fullWord} at {@code word.length()}.
 *
 * @param word     the shorter keyword that would receive the tag
 * @param fullWord the longer keyword whose remainder is the tag candidate
 * @return {@code true} when the candidate is rejected by any of the rules
 *         below, {@code false} when it may be used as a tag
 */
private static boolean isSkipMergeTag(String word, String fullWord) {
    if (StringUtils.isBlank(word) || StringUtils.isBlank(fullWord)) {
        return true;
    }

    final int prefixLen = word.length();
    final int totalLen = fullWord.length();
    if (totalLen <= prefixLen) {
        // Nothing is left after the prefix to serve as a tag.
        return true;
    }

    final boolean prefixIsEnglish = StringUtils.isEnAndMidSpaceStr(word);
    final int prefixBytes = StringUtils.getByteLength(word);
    if (!(prefixBytes > 3 && prefixBytes <= 18)) {
        // The keyword's byte length (per StringUtils.getByteLength) must be in (3, 18].
        return true;
    }

    final String prefixLastChar = word.substring(prefixLen - 1);
    final String restFirstChar = fullWord.substring(prefixLen, prefixLen + 1);
    final String candidateTag = fullWord.substring(prefixLen).trim();

    final int candidateBytes = StringUtils.getByteLength(candidateTag);
    if (!(candidateBytes > 3 && candidateBytes <= 14)) {
        // Limit the byte length allowed for a tag to (3, 14].
        return true;
    }

    final boolean startsWithTShirt =
            candidateTag.length() >= 2 && candidateTag.toLowerCase().startsWith("t恤");
    if (prefixIsEnglish && StringUtils.isEnStr(restFirstChar) && !startsWithTShirt) {
        // English keyword followed directly by an English character: treated as
        // one English word, so no tag is produced. "T恤" (T-shirt) is the exception.
        return true;
    }

    // Rule A: a remainder starting with "色" is usually the tail of a color word
    //         that was split apart, so it is not used as a tag.
    // Rule B: the keyword's last character plus the remainder's first character
    //         form a common word listed in spWordSet, so the split is not a tag.
    return candidateTag.startsWith("色")
            || spWordSet.contains(prefixLastChar + restFirstChar);
}
private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) {
EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo();
......
......@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.nio.channels.Pipe;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
......@@ -2121,6 +2122,7 @@ public abstract class StringUtils {
}
public static void main(String[] arg) {
Set<String> spWordSet = new HashSet<>(Arrays.asList("靴子","鞋子","裤子","袜子","裙子","帽子","杯子","箱子","包包","包袋","包袋"));
String word = "ab c ";
String word1 = "ab c 中文";
int wordLen = StringUtils.getByteLength(word);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment