Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
suggest-task
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
田川
suggest-task
Commits
b675b357
Commit
b675b357
authored
Aug 03, 2022
by
王明范
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
pass word with tag
parent
e8646980
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
104 additions
and
31 deletions
+104
-31
SuggestTask.java
suggest-task/src/main/java/com/secoo/so/suggest/task/SuggestTask.java
+102
-31
StringUtils.java
suggest-task/src/main/java/com/secoo/so/suggest/util/StringUtils.java
+2
-0
No files found.
suggest-task/src/main/java/com/secoo/so/suggest/task/SuggestTask.java
View file @
b675b357
...
@@ -37,6 +37,9 @@ public class SuggestTask {
...
@@ -37,6 +37,9 @@ public class SuggestTask {
private
static
int
maxTagSize
=
5
;
private
static
int
maxTagSize
=
5
;
private
static
long
startTime
=
System
.
currentTimeMillis
();
private
static
long
startTime
=
System
.
currentTimeMillis
();
private
static
Set
<
String
>
spWordSet
=
new
HashSet
<>(
Arrays
.
asList
(
"靴子"
,
"鞋子"
,
"裤子"
,
"袜子"
,
"裙子"
,
"帽子"
,
"杯子"
,
"箱子"
,
"包包"
,
"包袋"
,
"包带"
,
"表带"
));
private
static
List
<
Set
<
String
>>
synonymList
=
new
ArrayList
<>();
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
startTime
=
System
.
currentTimeMillis
();
startTime
=
System
.
currentTimeMillis
();
log
.
info
(
">>>>>>>>>>>> start run SuggestTask , startTime: "
+
startTime
);
log
.
info
(
">>>>>>>>>>>> start run SuggestTask , startTime: "
+
startTime
);
...
@@ -56,6 +59,9 @@ public class SuggestTask {
...
@@ -56,6 +59,9 @@ public class SuggestTask {
sensitiveMap
=
loadSensitiveMap
();
sensitiveMap
=
loadSensitiveMap
();
europeWordMap
=
loadEuropeWordMap
();
europeWordMap
=
loadEuropeWordMap
();
// 加载标签同义词
loadTagSynonym
();
// 加载搜索词并处理
// 加载搜索词并处理
processSuggestTask
(
startTime
);
processSuggestTask
(
startTime
);
...
@@ -180,6 +186,16 @@ public class SuggestTask {
...
@@ -180,6 +186,16 @@ public class SuggestTask {
return
prefixFilterList
;
return
prefixFilterList
;
}
}
/**
 * Registers the built-in tag synonym groups in {@code synonymList}.
 *
 * <p>Two groups are appended, in this order: the "male" wording variants and
 * the "female" wording variants. If {@code synonymList} is {@code null} it is
 * replaced with a fresh list first. Each call appends both groups again.
 */
private static void loadTagSynonym() {
    // Make sure there is a list to append to before building the groups.
    if (synonymList == null) {
        synonymList = new ArrayList<>();
    }

    // One row per synonym group; every word in a row is treated as equivalent.
    String[][] synonymGroups = {
        {"男性", "男式", "男士", "男款", "男"},
        {"女性", "女式", "女士", "女款", "女"}
    };
    for (String[] groupWords : synonymGroups) {
        Set<String> group = new HashSet<>(Arrays.asList(groupWords));
        synonymList.add(group);
    }
}
private
static
String
cleanKeyword
(
String
keyword
)
{
private
static
String
cleanKeyword
(
String
keyword
)
{
if
(
keyword
!=
null
)
{
if
(
keyword
!=
null
)
{
String
fixKeyword
=
PinYinUtils
.
convertToSimplifiedChinese
(
keyword
);
String
fixKeyword
=
PinYinUtils
.
convertToSimplifiedChinese
(
keyword
);
...
@@ -244,7 +260,9 @@ public class SuggestTask {
...
@@ -244,7 +260,9 @@ public class SuggestTask {
esSuggestKeywordMap
.
put
(
fillKeyword
,
buildDefaultEsSuggestKeywordInfo
(
fillKeyword
));
esSuggestKeywordMap
.
put
(
fillKeyword
,
buildDefaultEsSuggestKeywordInfo
(
fillKeyword
));
}
}
}
}
mergeKeywordTag
(
esSuggestKeywordMap
);
// 处理部分keyword,合并为其他词的tag
ConcurrentHashMap
<
String
,
EsSuggestKeywordInfo
>
tmpSuggestKeywordMap
=
new
ConcurrentHashMap
<>();
// 过滤词
// 过滤词
List
<
EsSuggestKeywordInfo
>
suggestKeywordInfoList
=
new
ArrayList
<>();
List
<
EsSuggestKeywordInfo
>
suggestKeywordInfoList
=
new
ArrayList
<>();
int
processCount
=
0
;
int
processCount
=
0
;
...
@@ -262,6 +280,7 @@ public class SuggestTask {
...
@@ -262,6 +280,7 @@ public class SuggestTask {
cleanBeforeSaveToEs
(
suggestKeywordInfo
);
cleanBeforeSaveToEs
(
suggestKeywordInfo
);
suggestKeywordInfoList
.
add
(
suggestKeywordInfo
);
suggestKeywordInfoList
.
add
(
suggestKeywordInfo
);
tmpSuggestKeywordMap
.
put
(
suggestKeywordInfo
.
getKeyword
(),
suggestKeywordInfo
);
}
}
processCount
++;
processCount
++;
...
@@ -269,7 +288,7 @@ public class SuggestTask {
...
@@ -269,7 +288,7 @@ public class SuggestTask {
log
.
info
(
"keyword filter process: {} / {}"
,
processCount
,
totalCount
);
log
.
info
(
"keyword filter process: {} / {}"
,
processCount
,
totalCount
);
}
}
}
}
mergeKeywordTag
(
tmpSuggestKeywordMap
);
// 处理部分keyword,合并为其他词的tag
if
(
"true"
.
equalsIgnoreCase
(
System
.
getProperty
(
"suggest.saveToFile"
)))
{
if
(
"true"
.
equalsIgnoreCase
(
System
.
getProperty
(
"suggest.saveToFile"
)))
{
// save to file
// save to file
saveSuggestKeywordToFile
(
suggestKeywordInfoList
);
saveSuggestKeywordToFile
(
suggestKeywordInfoList
);
...
@@ -292,7 +311,6 @@ public class SuggestTask {
...
@@ -292,7 +311,6 @@ public class SuggestTask {
int
wordLen
=
StringUtils
.
getByteLength
(
word
);
int
wordLen
=
StringUtils
.
getByteLength
(
word
);
int
length
=
word
.
length
();
int
length
=
word
.
length
();
if
(
wordLen
>
3
&&
wordLen
<=
18
&&
StringUtils
.
isBlank
(
suggestInfo
.
getSuggestTags
()))
{
if
(
wordLen
>
3
&&
wordLen
<=
18
&&
StringUtils
.
isBlank
(
suggestInfo
.
getSuggestTags
()))
{
boolean
isEN
=
StringUtils
.
isEnAndMidSpaceStr
(
word
);
List
<
EsSuggestKeywordInfo
>
suggestList
=
new
ArrayList
<>();
List
<
EsSuggestKeywordInfo
>
suggestList
=
new
ArrayList
<>();
int
keyCount
=
0
;
int
keyCount
=
0
;
// 获取包含word的suggest,满足条件的存入suggestList列表,限定最多100个词
// 获取包含word的suggest,满足条件的存入suggestList列表,限定最多100个词
...
@@ -300,29 +318,18 @@ public class SuggestTask {
...
@@ -300,29 +318,18 @@ public class SuggestTask {
if
(
keyCount
>
maxCount
)
{
if
(
keyCount
>
maxCount
)
{
break
;
break
;
}
}
String
fulWord
=
keyList
.
get
(
j
);
String
ful
l
Word
=
keyList
.
get
(
j
);
EsSuggestKeywordInfo
tmpSuggest
=
esSuggestKeywordMap
.
get
(
fulWord
);
EsSuggestKeywordInfo
tmpSuggest
=
esSuggestKeywordMap
.
get
(
ful
l
Word
);
if
(
StringUtils
.
isNotBlank
(
tmpSuggest
.
getSuggestTags
()))
{
if
(
StringUtils
.
isNotBlank
(
tmpSuggest
.
getSuggestTags
()))
{
continue
;
continue
;
}
}
int
rightLen
=
StringUtils
.
getByteLength
(
fulWord
)
-
wordLen
;
int
rightLen
=
StringUtils
.
getByteLength
(
ful
l
Word
)
-
wordLen
;
if
(
fulWord
.
startsWith
(
word
))
{
if
(
ful
l
Word
.
startsWith
(
word
))
{
if
(
rightLen
>
3
&&
rightLen
<=
14
)
{
if
(
rightLen
>
3
&&
rightLen
<=
14
)
{
String
subWord
=
fulWord
.
substring
(
length
,
length
+
1
);
if
(
isSkipMergeTag
(
word
,
fullWord
))
{
boolean
isTShirt
=
false
;
String
rightWord
=
fulWord
.
substring
(
length
,
fulWord
.
length
()).
trim
();
int
realLen
=
StringUtils
.
getByteLength
(
rightWord
);
if
(
rightWord
.
length
()
>=
2
&&
rightWord
.
toLowerCase
().
startsWith
(
"t恤"
))
{
isTShirt
=
true
;
}
if
(
isEN
&&
(
StringUtils
.
isEnStr
(
subWord
)
&&
!
isTShirt
))
{
continue
;
}
if
(
realLen
<=
3
||
realLen
>
14
)
{
continue
;
continue
;
}
}
suggestList
.
add
(
esSuggestKeywordMap
.
get
(
fulWord
));
suggestList
.
add
(
esSuggestKeywordMap
.
get
(
ful
l
Word
));
keyCount
++;
keyCount
++;
}
}
}
else
{
}
else
{
...
@@ -344,27 +351,91 @@ public class SuggestTask {
...
@@ -344,27 +351,91 @@ public class SuggestTask {
return
-
1
;
return
-
1
;
}
}
});
});
StringBuffer
sb
=
new
StringBuffer
(
""
);
for
(
int
k
=
0
,
count
=
0
;
k
<
suggestList
.
size
()
&&
count
<
3
;
k
++,
count
++)
{
Set
<
String
>
tagSet
=
new
HashSet
<>();
int
count
=
0
;
for
(
int
k
=
0
;
k
<
suggestList
.
size
();
k
++)
{
EsSuggestKeywordInfo
info
=
suggestList
.
get
(
k
);
EsSuggestKeywordInfo
info
=
suggestList
.
get
(
k
);
String
fulWord
=
info
.
getKeyword
();
String
fullWord
=
info
.
getKeyword
();
int
fulLen
=
fulWord
.
length
();
int
fulLen
=
fullWord
.
length
();
String
subWord
=
fulWord
.
substring
(
length
,
fulLen
).
trim
();
String
subWord
=
fullWord
.
substring
(
length
,
fulLen
).
trim
();
if
(
k
>
0
)
{
if
(
count
<
3
&&
!
isSkipSynonymTag
(
subWord
,
tagSet
))
{
sb
.
append
(
","
);
tagSet
.
add
(
subWord
);
count
++;
}
}
sb
.
append
(
subWord
);
}
}
if
(
sb
.
length
()
>
0
)
{
if
(
tagSet
.
size
()
>
0
)
{
String
tags
=
String
.
join
(
","
,
tagSet
);
if
(
"lv女包"
.
equals
(
word
))
{
if
(
"lv女包"
.
equals
(
word
))
{
log
.
info
(
"debugLog keyword tag:"
+
sb
.
toString
()
);
log
.
info
(
"debugLog keyword tag:"
+
tags
);
}
}
suggestInfo
.
setSuggestTags
(
sb
.
toString
()
);
suggestInfo
.
setSuggestTags
(
tags
);
}
}
}
}
}
}
}
}
}
}
/**
 * Decides whether {@code subWord} should be left out of the tag set being built.
 *
 * <p>A candidate is skipped when it is blank, or when the first synonym group
 * in {@code synonymList} that contains it already has one of its members in
 * {@code tagSet}.
 *
 * @param subWord candidate tag text
 * @param tagSet  tags that have already been accepted
 * @return {@code true} if the candidate must be skipped, {@code false} otherwise
 */
private static boolean isSkipSynonymTag(String subWord, Set<String> tagSet) {
    if (StringUtils.isBlank(subWord)) {
        return true;
    }

    for (Set<String> group : synonymList) {
        if (!group.contains(subWord)) {
            continue;
        }
        // Synonym groups are not supported to overlap, so only the first
        // matching group is consulted and the search stops here.
        for (String member : group) {
            if (tagSet.contains(member)) {
                return true;
            }
        }
        return false;
    }
    return false;
}
/**
 * Decides whether the text that follows {@code word} inside {@code fullWord}
 * must NOT be used as a tag for {@code word}.
 *
 * <p>The remainder is taken as {@code fullWord} from index {@code word.length()}
 * onward, trimmed. The method returns {@code true} (skip) when any of these holds:
 * <ul>
 *   <li>either argument is blank, or {@code fullWord} is not longer than {@code word};</li>
 *   <li>the byte length of {@code word} is not in (3, 18];</li>
 *   <li>the byte length of the trimmed remainder is not in (3, 14];</li>
 *   <li>{@code word} is an English string and the character right after it is
 *       English too, unless the remainder starts with "t恤" (case-insensitive);</li>
 *   <li>the remainder starts with "色";</li>
 *   <li>the last character of {@code word} together with the first character
 *       after it is one of the entries of {@code spWordSet}.</li>
 * </ul>
 *
 * @param word     the shorter keyword that would receive the tag
 * @param fullWord the longer keyword; NOTE(review): callers appear to pass only
 *                 values that start with {@code word} — this is not checked here
 * @return {@code true} if the remainder must not become a tag, {@code false} otherwise
 */
private static boolean isSkipMergeTag(String word, String fullWord) {
    if (StringUtils.isBlank(word) || StringUtils.isBlank(fullWord)) {
        return true;
    }

    final int prefixChars = word.length();
    final int totalChars = fullWord.length();
    if (totalChars <= prefixChars) {
        return true;
    }

    final boolean prefixIsEnglish = StringUtils.isEnAndMidSpaceStr(word);

    final int prefixBytes = StringUtils.getByteLength(word);
    if (!(prefixBytes > 3 && prefixBytes <= 18)) {
        return true;
    }

    // Characters on both sides of the split point, plus the trimmed remainder.
    final String boundaryLeft = word.substring(prefixChars - 1);
    final String boundaryRight = fullWord.substring(prefixChars, prefixChars + 1);
    final String tail = fullWord.substring(prefixChars, totalChars).trim();

    // Restrict the length of what may become a tag.
    final int tailBytes = StringUtils.getByteLength(tail);
    if (!(tailBytes > 3 && tailBytes <= 14)) {
        return true;
    }

    final boolean tailIsTShirt = tail.length() >= 2 && tail.toLowerCase().startsWith("t恤");

    // English on the left and an English character right after it is treated as
    // one English word that must not be split; "T恤" is the exception.
    if (prefixIsEnglish && StringUtils.isEnStr(boundaryRight) && !tailIsTShirt) {
        return true;
    }

    // A leading "色" usually belongs to a colour word that ends the prefix,
    // so the split-off part is not used as a tag.
    if (tail.startsWith("色")) {
        return true;
    }

    // The two characters around the split point form a common word: do not split it.
    return spWordSet.contains(boundaryLeft + boundaryRight);
}
private
static
EsSuggestKeywordInfo
buildDefaultEsSuggestKeywordInfo
(
String
keyword
)
{
private
static
EsSuggestKeywordInfo
buildDefaultEsSuggestKeywordInfo
(
String
keyword
)
{
EsSuggestKeywordInfo
esSuggestKeywordInfo
=
new
EsSuggestKeywordInfo
();
EsSuggestKeywordInfo
esSuggestKeywordInfo
=
new
EsSuggestKeywordInfo
();
...
...
suggest-task/src/main/java/com/secoo/so/suggest/util/StringUtils.java
View file @
b675b357
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
java.math.BigDecimal
;
import
java.math.BigDecimal
;
import
java.nio.channels.Pipe
;
import
java.nio.charset.Charset
;
import
java.nio.charset.Charset
;
import
java.nio.charset.StandardCharsets
;
import
java.nio.charset.StandardCharsets
;
import
java.security.MessageDigest
;
import
java.security.MessageDigest
;
...
@@ -2121,6 +2122,7 @@ public abstract class StringUtils {
...
@@ -2121,6 +2122,7 @@ public abstract class StringUtils {
}
}
public
static
void
main
(
String
[]
arg
)
{
public
static
void
main
(
String
[]
arg
)
{
Set
<
String
>
spWordSet
=
new
HashSet
<>(
Arrays
.
asList
(
"靴子"
,
"鞋子"
,
"裤子"
,
"袜子"
,
"裙子"
,
"帽子"
,
"杯子"
,
"箱子"
,
"包包"
,
"包袋"
,
"包袋"
));
String
word
=
"ab c "
;
String
word
=
"ab c "
;
String
word1
=
"ab c 中文"
;
String
word1
=
"ab c 中文"
;
int
wordLen
=
StringUtils
.
getByteLength
(
word
);
int
wordLen
=
StringUtils
.
getByteLength
(
word
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment