Commit 5b449033 by 王明范

merge tag

parent c31088f6
......@@ -243,7 +243,7 @@ public class SuggestTask {
esSuggestKeywordMap.put(fillKeyword, buildDefaultEsSuggestKeywordInfo(fillKeyword));
}
}
mergeKeywordTag(esSuggestKeywordMap); // 处理部分keyword,合并为其他词的tag
// 过滤词
List<EsSuggestKeywordInfo> suggestKeywordInfoList = new ArrayList<>();
int processCount = 0;
......@@ -277,6 +277,73 @@ public class SuggestTask {
saveSuggestKeywordToEs(suggestKeywordInfoList);
}
}
/**
 * Derives suggest tags for short keywords from the longer keywords that extend them.
 *
 * <p>The keys are sorted so that every key starting with a given word sits in one contiguous
 * run directly after that word. For each word whose byte length (as reported by
 * {@code StringUtils.getByteLength}) is in (3, 15] and whose suggest tags are blank, that run
 * is scanned for extension keywords whose extra part is in (3, 12] bytes, both before and
 * after trimming. The collected extensions are ordered by {@code getWordABRank()} descending
 * with {@code null} ranks last, and the trimmed suffixes of the best three are joined with
 * {@code ","} and stored through {@code setSuggestTags}.
 *
 * @param esSuggestKeywordMap keyword to info map; matching entries get their suggest tags set
 *                            in place, no entry is added or removed
 */
private static void mergeKeywordTag(ConcurrentHashMap<String, EsSuggestKeywordInfo> esSuggestKeywordMap) {
    final int maxCandidates = 100; // upper bound on extension keywords collected per word
    final int maxTags = 3;         // number of suffixes written into the tag string

    List<String> keyList = new ArrayList<>(esSuggestKeywordMap.keySet());
    // Natural String order guarantees that all extensions of a word follow it contiguously.
    Collections.sort(keyList);

    for (int i = 0; i < keyList.size(); i++) {
        String word = keyList.get(i);
        EsSuggestKeywordInfo suggestInfo = esSuggestKeywordMap.get(word);
        int wordLen = StringUtils.getByteLength(word);
        if (wordLen <= 3 || wordLen > 15 || !StringUtils.isBlank(suggestInfo.getSuggestTags())) {
            continue;
        }
        int length = word.length(); // char count, used for substring offsets (not the byte length)
        boolean isEN = StringUtils.isEnAndMidSpaceStr(word);

        List<EsSuggestKeywordInfo> suggestList = new ArrayList<>();
        for (int j = i + 1; j < keyList.size() && suggestList.size() < maxCandidates; j++) {
            String fulWord = keyList.get(j);
            if (!fulWord.startsWith(word)) {
                // Left the contiguous run of keys sharing this prefix; nothing further can match.
                break;
            }
            int extraLen = StringUtils.getByteLength(fulWord) - wordLen;
            if (extraLen <= 3 || extraLen > 12) {
                // This candidate is too short/long, but later keys in the run may still qualify.
                continue;
            }
            // Keys are distinct and fulWord starts with word, so fulWord is strictly longer.
            String subWord = fulWord.substring(length, length + 1);
            if (isEN && StringUtils.isEnStr(subWord)) {
                // Presumably avoids treating a longer English word ("app" -> "apple") as an
                // extension of the base word - confirm against isEnStr's contract.
                continue;
            }
            String rightWord = fulWord.substring(length).trim();
            int rightLen = StringUtils.getByteLength(rightWord);
            // rightLen already is the suffix length; it must not be reduced by wordLen again.
            if (rightLen <= 3 || rightLen > 12) {
                continue;
            }
            suggestList.add(esSuggestKeywordMap.get(fulWord));
        }
        if (suggestList.isEmpty()) {
            continue;
        }

        // Highest rank first; entries without a rank go to the end.
        Collections.sort(suggestList, (t1, t2) -> {
            Double score1 = t1.getWordABRank();
            Double score2 = t2.getWordABRank();
            if (score1 != null && score2 != null) {
                return score2.compareTo(score1);
            } else if (score1 == null && score2 == null) {
                return 0;
            } else if (score1 == null) {
                return 1;
            } else {
                return -1;
            }
        });

        StringBuilder sb = new StringBuilder();
        int tagCount = Math.min(suggestList.size(), maxTags);
        for (int k = 0; k < tagCount; k++) {
            // NOTE(review): assumes getKeyword() equals the map key the entry was found under.
            String fulWord = suggestList.get(k).getKeyword();
            if (k > 0) {
                sb.append(",");
            }
            sb.append(fulWord.substring(length).trim());
        }
        if (sb.length() > 0) {
            suggestInfo.setSuggestTags(sb.toString());
        }
    }
}
private static EsSuggestKeywordInfo buildDefaultEsSuggestKeywordInfo(String keyword) {
EsSuggestKeywordInfo esSuggestKeywordInfo = new EsSuggestKeywordInfo();
......
......@@ -2067,6 +2067,17 @@ public abstract class StringUtils {
}
/** Letters and whitespace only; compiled once because the check runs once per keyword. */
private static final java.util.regex.Pattern EN_AND_SPACE_PATTERN =
        java.util.regex.Pattern.compile("[a-zA-Z\\s]+");

/**
 * Checks whether a string is English text that may contain inner whitespace.
 *
 * <p>A string that starts or ends with a space is not considered English; whitespace
 * between letters is accepted. {@code null} and the empty string yield {@code false}.
 *
 * @param word the string to test, may be {@code null}
 * @return {@code true} if {@code word} has no leading or trailing space and, after trimming,
 *         consists only of ASCII letters and whitespace
 */
public static boolean isEnAndMidSpaceStr(String word) {
    if (word == null || word.startsWith(" ") || word.endsWith(" ")) {
        return false;
    }
    return EN_AND_SPACE_PATTERN.matcher(word.trim()).matches();
}
/**
* 判断是否包含中文
*/
public static boolean isContainChStr(String word) {
......@@ -2109,6 +2120,19 @@ public abstract class StringUtils {
return 0;
}
/**
 * Ad-hoc manual check: prints how an English prefix and the remainder of a longer keyword
 * are classified by {@link #isEnAndMidSpaceStr(String)}, plus the first character after the
 * prefix and the full remainder.
 *
 * @param arg ignored
 */
public static void main(String[] arg) {
    String word = "ab c ";
    String word1 = "ab c 中文";
    // substring() takes char offsets, so use the char count of the prefix, not its byte length.
    int prefixChars = word.length();
    String aaa = word1.substring(prefixChars, prefixChars + 1);
    String bbb = word1.substring(prefixChars);
    System.out.println(isEnAndMidSpaceStr(word));
    System.out.println(isEnAndMidSpaceStr(bbb));
    System.out.println(aaa);
    System.out.println(bbb);
}
/**
* 32位md5加密
*/
......@@ -2153,7 +2177,7 @@ public abstract class StringUtils {
*
* <p>If the stripChars String is {@code null}, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.
* Alternatively use {@link #strip(String)}.</p>
* Alternatively use {@link #strip(String, String)}.</p>
*
* <pre>
* StringUtils.strip(null, *) = null
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment