Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
suggest-task
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
田川
suggest-task
Commits
099f58c5
Commit
099f58c5
authored
Aug 08, 2022
by
王明范
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'TECH-SEARCH-merge-keyword-tag' into 'master'
Tech search merge keyword tag See merge request
!4
parents
c31088f6
70d2074b
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
671 additions
and
16 deletions
+671
-16
pom.xml
suggest-task/pom.xml
+35
-1
SqpDubboClient.java
suggest-task/src/main/java/com/secoo/so/suggest/client/SqpDubboClient.java
+122
-0
QueryPlanHelper.java
suggest-task/src/main/java/com/secoo/so/suggest/helper/QueryPlanHelper.java
+180
-0
WordHelper.java
suggest-task/src/main/java/com/secoo/so/suggest/helper/WordHelper.java
+68
-0
SuggestTask.java
suggest-task/src/main/java/com/secoo/so/suggest/task/SuggestTask.java
+204
-0
StringUtils.java
suggest-task/src/main/java/com/secoo/so/suggest/util/StringUtils.java
+27
-1
config.properties
suggest-task/src/main/profiles/prod/config.properties
+4
-0
config.properties
suggest-task/src/main/profiles/test/config.properties
+12
-7
db.properties
suggest-task/src/main/profiles/test/db.properties
+11
-7
specialWord示例.txt
suggest-task/src/main/resources/specialWord示例.txt
+4
-0
synonymTag示例.txt
suggest-task/src/main/resources/synonymTag示例.txt
+4
-0
No files found.
suggest-task/pom.xml
View file @
099f58c5
...
@@ -36,7 +36,41 @@
...
@@ -36,7 +36,41 @@
</profiles>
</profiles>
<dependencies>
<dependencies>
<dependency>
<groupId>
com.secoo
</groupId>
<artifactId>
sqp4j-client
</artifactId>
<exclusions>
<exclusion>
<groupId>
org.slf4j
</groupId>
<artifactId>
slf4j-log4j12
</artifactId>
</exclusion>
<exclusion>
<groupId>
log4j
</groupId>
<artifactId>
log4j
</artifactId>
</exclusion>
<exclusion>
<artifactId>
secoo-log
</artifactId>
<groupId>
com.secoo.mall
</groupId>
</exclusion>
</exclusions>
<version>
2.9.6.RELEASE
</version>
</dependency>
<dependency>
<groupId>
com.alibaba
</groupId>
<artifactId>
dubbo
</artifactId>
<version>
2.6.0
</version>
<exclusions>
<exclusion>
<artifactId>
spring-context
</artifactId>
<groupId>
org.springframework
</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>
com.github.sgroschupf
</groupId>
<artifactId>
zkclient
</artifactId>
<version>
0.1
</version>
</dependency>
<dependency>
<dependency>
<groupId>
mysql
</groupId>
<groupId>
mysql
</groupId>
<artifactId>
mysql-connector-java
</artifactId>
<artifactId>
mysql-connector-java
</artifactId>
...
...
suggest-task/src/main/java/com/secoo/so/suggest/client/SqpDubboClient.java
0 → 100644
View file @
099f58c5
package
com
.
secoo
.
so
.
suggest
.
client
;
import
com.alibaba.dubbo.config.ApplicationConfig
;
import
com.alibaba.dubbo.config.ReferenceConfig
;
import
com.alibaba.dubbo.config.RegistryConfig
;
import
com.secoo.search.sqp4j.QueryPlan
;
import
org.apache.log4j.Logger
;
import
java.util.Map
;
/**
 * Factory for Dubbo references to the SQP {@code com.secoo.search.sqp4j.QueryPlan} service.
 *
 * <p>Three registry-based references (prod, uat, test) are created lazily and then reused for the
 * lifetime of the JVM; {@link #getDirectImpl(String, Map)} builds a new point-to-point reference
 * on every call.
 *
 * @author wangmingfan
 * @date 2020/8/17
 */
public class SqpDubboClient {

    // Currently unused; kept so the class keeps a logger available.
    private static final Logger loger = Logger.getLogger(SqpDubboClient.class);

    private static final String APPLICATION_NAME = "sem-test-tool";
    private static final String SERVICE_INTERFACE = "com.secoo.search.sqp4j.QueryPlan";
    private static final String SERVICE_VERSION = "1.0.0";
    private static final String SERVICE_PROTOCOL = "dubbo";
    private static final String UAT_GROUP = "grey";

    /** Registry used by both the prod and the uat reference. */
    private static final String DEFAULT_REGISTRY_ADDRESS =
            "zookeeper://zk-mall1.secoolocal.com:5181?backup=zk-mall2.secoolocal.com:5181,zk-mall3.secoolocal.com:5181";

    /** Registry used by the test reference. */
    private static final String TEST_REGISTRY_ADDRESS =
            "zookeeper://10.185.240.81:2181?backup=10.185.240.82:2181,10.185.240.83:2181";

    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigProd = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigUat = null;
    private static ReferenceConfig<QueryPlan> dubboSqpReferenceConfigTest = null;

    /**
     * Connects directly to one provider, bypassing the registry.
     *
     * @param url provider address in {@code ip:port} form
     * @param map optional output map; when non-null it receives the reference settings under the
     *            keys {@code Client}, {@code Interface}, {@code Protocol}, {@code Url} and
     *            {@code Cluster}
     * @return the service proxy obtained from the new reference
     */
    public static QueryPlan getDirectImpl(String url, Map<String, String> map) {
        // NOTE(review): a new ReferenceConfig is built per call and never destroyed; callers that
        // invoke this repeatedly for the same url should hold on to the returned proxy.
        ReferenceConfig<QueryPlan> impl = directDubboSqpReferenceConfig(url);
        QueryPlan dubboSqp = impl.get();
        if (map != null) {
            map.put("Client", impl.getClient());
            map.put("Interface", impl.getInterface());
            map.put("Protocol", impl.getProtocol());
            map.put("Url", impl.getUrl());
            map.put("Cluster", impl.getCluster());
        }
        return dubboSqp;
    }

    /**
     * @return the service proxy resolved through the test registry (no group is set)
     */
    public static QueryPlan getTestImpl() {
        return getTestDubboSqpReferenceConfig().get();
    }

    /**
     * @return the service proxy resolved through the default registry with group {@code grey}
     */
    public static QueryPlan getUatImpl() {
        return getDubboSqpReferenceConfigUat().get();
    }

    /**
     * @return the service proxy resolved through the default registry without a group
     */
    public static QueryPlan getProdImpl() {
        return getDubboSqpReferenceConfigProd().get();
    }

    // The lazy getters are synchronized so that concurrent first calls cannot create two
    // references for the same environment.

    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigProd() {
        if (dubboSqpReferenceConfigProd == null) {
            dubboSqpReferenceConfigProd = buildDubboSqpReferenceConfig();
        }
        return dubboSqpReferenceConfigProd;
    }

    private static synchronized ReferenceConfig<QueryPlan> getDubboSqpReferenceConfigUat() {
        if (dubboSqpReferenceConfigUat == null) {
            ReferenceConfig<QueryPlan> impl = buildDubboSqpReferenceConfig();
            impl.setGroup(UAT_GROUP);
            // Publish only after the group is set so no caller sees a half-configured reference.
            dubboSqpReferenceConfigUat = impl;
        }
        return dubboSqpReferenceConfigUat;
    }

    /**
     * Lazily creates the reference that resolves providers through the test registry.
     *
     * @return the shared test reference
     */
    private static synchronized ReferenceConfig<QueryPlan> getTestDubboSqpReferenceConfig() {
        if (dubboSqpReferenceConfigTest == null) {
            dubboSqpReferenceConfigTest = buildRegistryReference(TEST_REGISTRY_ADDRESS);
        }
        return dubboSqpReferenceConfigTest;
    }

    /** Builds a reference that resolves providers through the default registry. */
    private static ReferenceConfig<QueryPlan> buildDubboSqpReferenceConfig() {
        return buildRegistryReference(DEFAULT_REGISTRY_ADDRESS);
    }

    /**
     * Builds a reference that connects straight to one provider.
     *
     * @param url provider address in {@code ip:port} form, e.g. {@code 10.185.240.158:20062}
     * @return a new, not yet initialised reference
     */
    private static ReferenceConfig<QueryPlan> directDubboSqpReferenceConfig(String url) {
        ReferenceConfig<QueryPlan> impl = newBaseReference();
        impl.setUrl(url);
        return impl;
    }

    /** Builds a reference that discovers providers via the given zookeeper registry address. */
    private static ReferenceConfig<QueryPlan> buildRegistryReference(String registryAddress) {
        ReferenceConfig<QueryPlan> impl = newBaseReference();
        RegistryConfig registryConfig = new RegistryConfig(registryAddress);
        registryConfig.setProtocol("zookeeper");
        registryConfig.setClient("zkclient");
        impl.setRegistry(registryConfig);
        return impl;
    }

    /** Creates a reference carrying the settings shared by every environment. */
    private static ReferenceConfig<QueryPlan> newBaseReference() {
        ReferenceConfig<QueryPlan> impl = new ReferenceConfig<QueryPlan>();
        impl.setProtocol(SERVICE_PROTOCOL);
        impl.setApplication(new ApplicationConfig(APPLICATION_NAME));
        impl.setVersion(SERVICE_VERSION);
        impl.setInterface(SERVICE_INTERFACE);
        return impl;
    }
}
suggest-task/src/main/java/com/secoo/so/suggest/helper/QueryPlanHelper.java
0 → 100644
View file @
099f58c5
package
com
.
secoo
.
so
.
suggest
.
helper
;
import
com.secoo.abtest.common.Buckets
;
import
com.secoo.search.sqp4j.Explanation
;
import
com.secoo.search.sqp4j.Explanations
;
import
com.secoo.search.sqp4j.QueryPlan
;
import
com.secoo.search.sqp4j.QueryWord
;
import
com.secoo.search.sqp4j.client.QueryPlanClient
;
import
com.secoo.so.suggest.client.SqpDubboClient
;
import
com.secoo.so.suggest.config.ConfigUtil
;
import
com.secoo.so.suggest.util.FileUtils
;
import
com.secoo.so.suggest.util.StringUtils
;
import
org.apache.lucene.queryparser.classic.QueryParser
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.util.*
;
/**
 * Singleton helper around the SQP {@code QueryPlan} service.
 *
 * <p>It keeps two caches:
 * <ul>
 *   <li>{@code keywordMap}: keyword to number of query words, loaded from and appended to the file
 *       configured under {@code queryPlan.cachePath};</li>
 *   <li>{@code sqpCache}: keyword to {@link Explanation}, in memory only and capped in size.</li>
 * </ul>
 *
 * <p>The caches are plain, unsynchronized collections.
 *
 * @author wangmingfan
 * @date 2022/8/4
 */
public class QueryPlanHelper {

    private static final Logger LOG = LoggerFactory.getLogger(QueryPlanHelper.class);

    /** Once {@code sqpCache} grows beyond this size the oldest entry is evicted. */
    private static final int MAX_EXPLANATION_CACHE_SIZE = 100000;

    /** Age (in seconds) of the oldest file entry after which the whole file is rewritten. */
    private static final long SEVEN_DAYS_SECONDS = 3600L * 24 * 7;

    private static volatile QueryPlanHelper instance;
    private static QueryPlan client = null;

    Map<String, Explanation> sqpCache = new HashMap<>();
    // Insertion order of sqpCache keys; a linked list makes removing the head O(1).
    List<String> wordList = new LinkedList<>();
    Map<String, Integer> keywordMap = new HashMap<>();

    // Example value: /data/crontab/test/tmp/queryplan.txt
    private static String queryPlanFile = "";
    // Lines ("keyword,wordCount,epochSeconds") produced since the last write.
    private static List<String> newWordLines = new ArrayList<>();
    // Smallest timestamp seen while loading the cache file; Long.MAX_VALUE when nothing was loaded.
    private static long minTimeStamp = Long.MAX_VALUE;

    private QueryPlanHelper() {
        queryPlanFile = ConfigUtil.getString("queryPlan.cachePath", "");
        LOG.info("debugLog queryPlanFile path is " + queryPlanFile);
        client = SqpDubboClient.getProdImpl();
        loadQueryPlanFromFile();
    }

    /**
     * Returns the shared instance, creating it on first use (the instance field is volatile and
     * creation is guarded by a class-level lock).
     */
    public static QueryPlanHelper getInstance() {
        if (instance == null) {
            synchronized (QueryPlanHelper.class) {
                if (instance == null) {
                    instance = new QueryPlanHelper();
                }
            }
        }
        return instance;
    }

    /**
     * Returns how many query words the service reports for {@code keyword}.
     *
     * <p>Results are cached in {@code keywordMap}; every newly resolved keyword is also queued for
     * {@link #writeQueryPlanToFile()}.
     *
     * @param keyword the keyword to analyse
     * @return the number of query words, or {@code 0} when the keyword is blank or the service
     *         returned no usable explanation (such results are not cached)
     */
    public int explainQueryWordCount(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return 0;
        }
        Integer cached = keywordMap.get(keyword);
        if (cached != null) {
            return cached;
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation == null || explanation.getQueryWords() == null) {
            return 0;
        }
        int wordCount = explanation.getQueryWords().size();
        keywordMap.put(keyword, wordCount);
        newWordLines.add(keyword + "," + wordCount + "," + (System.currentTimeMillis() / 1000));
        return wordCount;
    }

    /**
     * Fills {@code keywordMap} from the cache file, one {@code keyword,wordCount,epochSeconds}
     * record per line. Malformed lines are skipped.
     */
    private void loadQueryPlanFromFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        List<String> lines = FileUtils.readLines(queryPlanFile);
        if (lines == null || lines.isEmpty()) {
            return;
        }
        for (String line : lines) {
            if (StringUtils.isBlank(line)) {
                continue;
            }
            // Split from the right: the two trailing fields are numeric, so a keyword that itself
            // contains a comma still round-trips instead of being dropped.
            int tsSeparator = line.lastIndexOf(',');
            int countSeparator = tsSeparator > 0 ? line.lastIndexOf(',', tsSeparator - 1) : -1;
            if (countSeparator <= 0) {
                continue;
            }
            String keyword = line.substring(0, countSeparator);
            String strWordCount = line.substring(countSeparator + 1, tsSeparator);
            String ts = line.substring(tsSeparator + 1);
            if (StringUtils.isBlank(keyword)
                    || !StringUtils.isNumber(strWordCount)
                    || !StringUtils.isNumber(ts)) {
                continue;
            }
            try {
                long timeStamp = Long.parseLong(ts);
                int wordCount = Integer.parseInt(strWordCount);
                if (timeStamp < minTimeStamp) {
                    minTimeStamp = timeStamp;
                }
                keywordMap.put(keyword, wordCount);
            } catch (NumberFormatException e) {
                LOG.warn("string to integer exception, line: " + line, e);
            }
        }
    }

    /**
     * Persists the keyword cache.
     *
     * <p>When the oldest entry loaded from the file is more than seven days old, every cached
     * keyword is written again with the current timestamp (full refresh); otherwise only the
     * keywords resolved since the last write are appended.
     */
    public void writeQueryPlanToFile() {
        if (StringUtils.isBlank(queryPlanFile)) {
            return;
        }
        long nowSecond = System.currentTimeMillis() / 1000;
        boolean fullRefresh = nowSecond - minTimeStamp > SEVEN_DAYS_SECONDS && !keywordMap.isEmpty();
        if (fullRefresh) {
            List<String> allLines = new ArrayList<>(keywordMap.size());
            for (Map.Entry<String, Integer> entry : keywordMap.entrySet()) {
                allLines.add(entry.getKey() + "," + entry.getValue() + "," + nowSecond);
            }
            newWordLines = allLines;
        }
        if (newWordLines != null && !newWordLines.isEmpty()) {
            // NOTE(review): assumes the third argument of FileUtils.saveToFile is an "append" flag
            // - confirm. A full refresh must replace the file: appending would keep the stale
            // timestamps, so every later run would append the whole cache again.
            FileUtils.saveToFile(newWordLines, queryPlanFile, !fullRefresh);
            newWordLines = new ArrayList<>();
            if (fullRefresh) {
                minTimeStamp = nowSecond;
            }
        }
    }

    /**
     * Returns the first explanation the service produces for {@code keyword}, using
     * {@code sqpCache} to avoid repeated remote calls.
     *
     * @param keyword the keyword to analyse
     * @return the explanation, or {@code null} when the keyword is blank or nothing was returned
     */
    public Explanation explain(String keyword) {
        if (StringUtils.isBlank(keyword)) {
            return null;
        }
        if (sqpCache.containsKey(keyword)) {
            return sqpCache.get(keyword);
        }
        Explanation explanation = requestFirstExplanation(keyword);
        if (explanation != null) {
            cacheKeyword(keyword, explanation);
        }
        return explanation;
    }

    /**
     * Calls the remote service for {@code keyword} and returns the first item of the answer, or
     * {@code null} when the answer is missing, has no item list, is empty or starts with null.
     */
    private Explanation requestFirstExplanation(String keyword) {
        String traceId = UUID.randomUUID().toString();
        Buckets bucket = new Buckets(new HashMap<String, String>());
        String cityCode = "";
        long currDate = 0L;
        int needSpell = 0;
        Explanations explanations =
                client.explain(traceId, bucket, cityCode, currDate, needSpell, keyword, null);
        if (explanations == null
                || explanations.getItems() == null
                || explanations.getItems().size() == 0) {
            return null;
        }
        return explanations.getItems().get(0);
    }

    /** Stores an explanation and evicts the oldest one once the cache exceeds its limit. */
    private void cacheKeyword(String keyword, Explanation explanation) {
        sqpCache.put(keyword, explanation);
        wordList.add(keyword);
        if (sqpCache.size() > MAX_EXPLANATION_CACHE_SIZE) {
            sqpCache.remove(wordList.remove(0));
        }
    }

    /** Manual demo of the first-in-first-out eviction scheme used by {@code cacheKeyword}. */
    public static void main(String[] arg) {
        Map<String, String> cache = new HashMap<>();
        List<String> list = new ArrayList<>();
        for (int i = 0; i < 7; i++) {
            cache.put("key_" + i, String.valueOf(i));
            list.add("key_" + i);
            if (cache.size() > 5) {
                String rk = list.get(0);
                list.remove(0);
                cache.remove(rk);
            }
            System.out.println("list size:" + list.size() + ";map size:" + cache.size());
        }
    }
}
suggest-task/src/main/java/com/secoo/so/suggest/helper/WordHelper.java
0 → 100644
View file @
099f58c5
package
com
.
secoo
.
so
.
suggest
.
helper
;
import
com.secoo.so.suggest.config.ConfigUtil
;
import
com.secoo.so.suggest.util.FileUtils
;
import
com.secoo.so.suggest.util.StringUtils
;
import
org.slf4j.Logger
;
import
org.slf4j.LoggerFactory
;
import
java.util.Arrays
;
import
java.util.HashSet
;
import
java.util.List
;
import
java.util.Set
;
/**
 * Loads word dictionaries from the files whose paths are configured under the keys
 * {@code specialWordPath} and {@code synonymTagPath}.
 *
 * @author wangmingfan
 * @date 2022/8/8
 */
public class WordHelper {

    private static final Logger LOG = LoggerFactory.getLogger(WordHelper.class);

    /** Configuration key holding the path of the special-word file. */
    private static final String specialWordFile = "specialWordPath";

    /** Configuration key holding the path of the synonym-tag file. */
    private static final String synonymTagFile = "synonymTagPath";

    /**
     * Loads the words that must not be split, one word per line.
     *
     * <p>Surrounding whitespace is removed and blank lines are ignored, so a stray space or an
     * empty trailing line in the file cannot produce an entry that never matches.
     *
     * @return the words read from the file; empty (never {@code null}) when no path is configured
     *         or the file yields no lines
     */
    public static Set<String> loadSpecialWords() {
        String specialWordPath = ConfigUtil.getString(specialWordFile);
        LOG.info("debugLog specialWordFile path is " + specialWordPath);
        Set<String> words = new HashSet<>();
        if (StringUtils.isNotBlank(specialWordPath)) {
            List<String> lines = FileUtils.readLines(specialWordPath);
            if (lines != null) {
                for (String line : lines) {
                    if (StringUtils.isNotBlank(line)) {
                        words.add(line.trim());
                    }
                }
            }
        }
        return words;
    }

    /**
     * Loads groups of synonymous tags. Each line of the file is one group whose members are
     * separated by a vertical bar, for example:
     * <pre>
     * 男款|男式|男士
     * 女款|女式|女士
     * </pre>
     *
     * <p>Members are trimmed; blank members and blank lines are ignored.
     *
     * @return the synonym groups; empty (never {@code null}) when no path is configured or the
     *         file yields no lines
     */
    public static Set<Set<String>> loadSynonymTags() {
        String synonymTagPath = ConfigUtil.getString(synonymTagFile);
        LOG.info("debugLog synonymTagFile path is " + synonymTagPath);
        Set<Set<String>> synonyms = new HashSet<>();
        if (StringUtils.isNotBlank(synonymTagPath)) {
            List<String> lines = FileUtils.readLines(synonymTagPath);
            // Same guard as loadSpecialWords: the reader's result is not assumed to be non-null.
            if (lines != null) {
                for (String line : lines) {
                    if (StringUtils.isBlank(line)) {
                        continue;
                    }
                    Set<String> words = new HashSet<>();
                    for (String word : Arrays.asList(line.split("\\|"))) {
                        if (StringUtils.isNotBlank(word)) {
                            words.add(word.trim());
                        }
                    }
                    if (!words.isEmpty()) {
                        synonyms.add(words);
                    }
                }
            }
        }
        return synonyms;
    }
}
suggest-task/src/main/java/com/secoo/so/suggest/task/SuggestTask.java
View file @
099f58c5
package
com
.
secoo
.
so
.
suggest
.
task
;
package
com
.
secoo
.
so
.
suggest
.
task
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSON
;
import
com.alibaba.fastjson.JSONObject
;
import
com.secoo.so.suggest.config.ConfigUtil
;
import
com.secoo.so.suggest.config.ConfigUtil
;
import
com.secoo.so.suggest.db.DwDataSource
;
import
com.secoo.so.suggest.db.DwDataSource
;
import
com.secoo.so.suggest.db.ErpDataSource
;
import
com.secoo.so.suggest.db.ErpDataSource
;
...
@@ -10,6 +11,8 @@ import com.secoo.so.suggest.entity.EsSuggestKeywordInfo;
...
@@ -10,6 +11,8 @@ import com.secoo.so.suggest.entity.EsSuggestKeywordInfo;
import
com.secoo.so.suggest.entity.SearchKeywordInfo
;
import
com.secoo.so.suggest.entity.SearchKeywordInfo
;
import
com.secoo.so.suggest.es.EsClient
;
import
com.secoo.so.suggest.es.EsClient
;
import
com.secoo.so.suggest.es.EsObject
;
import
com.secoo.so.suggest.es.EsObject
;
import
com.secoo.so.suggest.helper.QueryPlanHelper
;
import
com.secoo.so.suggest.helper.WordHelper
;
import
com.secoo.so.suggest.util.*
;
import
com.secoo.so.suggest.util.*
;
import
lombok.Data
;
import
lombok.Data
;
import
lombok.extern.slf4j.Slf4j
;
import
lombok.extern.slf4j.Slf4j
;
...
@@ -36,6 +39,10 @@ public class SuggestTask {
...
@@ -36,6 +39,10 @@ public class SuggestTask {
private
static
int
maxTagSize
=
5
;
private
static
int
maxTagSize
=
5
;
private
static
long
startTime
=
System
.
currentTimeMillis
();
private
static
long
startTime
=
System
.
currentTimeMillis
();
// Two-character words that must stay together; main() assigns the result of loadSpecialWords()
// and isSkipMergeTag() looks up the characters on both sides of a split point in it.
private
static
Set
<
String
>
spWordSet
=
new
HashSet
<>();
// Groups of interchangeable tags; main() assigns the result of loadTagSynonym() and
// isSkipSynonymTag() consults it.
private
static
Set
<
Set
<
String
>>
synonymList
=
new
HashSet
<>();
// Remainders that isSkipMergeTag() never accepts as a tag.
private
static
Set
<
String
>
ignoreWordSet
=
new
HashSet
<>(
Arrays
.
asList
(
"系列"
,
"型号"
,
"款式"
,
"风格"
));
public
static
void
main
(
String
[]
args
)
{
public
static
void
main
(
String
[]
args
)
{
startTime
=
System
.
currentTimeMillis
();
startTime
=
System
.
currentTimeMillis
();
log
.
info
(
">>>>>>>>>>>> start run SuggestTask , startTime: "
+
startTime
);
log
.
info
(
">>>>>>>>>>>> start run SuggestTask , startTime: "
+
startTime
);
...
@@ -55,10 +62,19 @@ public class SuggestTask {
...
@@ -55,10 +62,19 @@ public class SuggestTask {
sensitiveMap
=
loadSensitiveMap
();
sensitiveMap
=
loadSensitiveMap
();
europeWordMap
=
loadEuropeWordMap
();
europeWordMap
=
loadEuropeWordMap
();
// 加载部分确定不能分割的特殊词
spWordSet
=
loadSpecialWords
();
// 加载标签同义词
synonymList
=
loadTagSynonym
();
QueryPlanHelper
sqp
=
QueryPlanHelper
.
getInstance
();
// 加载搜索词并处理
// 加载搜索词并处理
processSuggestTask
(
startTime
);
processSuggestTask
(
startTime
);
log
.
info
(
"<<<<<<<<<<<< end run SuggestTask, startTime: {} , cost: {}ms"
,
startTime
,
(
System
.
currentTimeMillis
()
-
startTime
)
);
log
.
info
(
"<<<<<<<<<<<< end run SuggestTask, startTime: {} , cost: {}ms"
,
startTime
,
(
System
.
currentTimeMillis
()
-
startTime
)
);
System
.
exit
(
0
);
}
}
private
static
Map
<
String
,
Long
>
loadBrandMap
()
{
private
static
Map
<
String
,
Long
>
loadBrandMap
()
{
...
@@ -179,6 +195,33 @@ public class SuggestTask {
...
@@ -179,6 +195,33 @@ public class SuggestTask {
return
prefixFilterList
;
return
prefixFilterList
;
}
}
/**
 * Builds the synonym groups used when merging tags: two built-in groups (male and female
 * wording) plus whatever {@code WordHelper.loadSynonymTags()} returns.
 *
 * @return all synonym groups
 */
private static Set<Set<String>> loadTagSynonym() {
    Set<Set<String>> groups = new HashSet<>();
    groups.add(new HashSet<>(Arrays.asList("男性", "男式", "男士", "男款", "男")));
    groups.add(new HashSet<>(Arrays.asList("女性", "女式", "女士", "女款", "女")));

    Set<Set<String>> groupsFromFile = WordHelper.loadSynonymTags();
    if (!groupsFromFile.isEmpty()) {
        groups.addAll(groupsFromFile);
    }

    log.info("debugLog synonym count:" + groups.size());
    return groups;
}
/**
 * Builds the set of words that must not be split: a built-in list plus whatever
 * {@code WordHelper.loadSpecialWords()} returns.
 *
 * @return the combined word set
 */
private static Set<String> loadSpecialWords() {
    List<String> builtIn = Arrays.asList(
            "靴子", "鞋子", "裤子", "袜子", "裙子", "帽子", "杯子", "箱子",
            "包包", "包袋", "包带", "表带", "大号", "中号", "小号", "衣服", "t恤", "衣服",
            "男款", "男士", "男式", "男性", "男童",
            "女款", "女士", "女式", "女性", "女童",
            "大象", "男包", "女包", "男鞋", "女鞋");
    Set<String> combined = new HashSet<>(builtIn);

    Set<String> wordsFromFile = WordHelper.loadSpecialWords();
    if (!wordsFromFile.isEmpty()) {
        combined.addAll(wordsFromFile);
    }

    log.info("debugLog specialWords count:" + combined.size());
    return combined;
}
private
static
String
cleanKeyword
(
String
keyword
)
{
private
static
String
cleanKeyword
(
String
keyword
)
{
if
(
keyword
!=
null
)
{
if
(
keyword
!=
null
)
{
String
fixKeyword
=
PinYinUtils
.
convertToSimplifiedChinese
(
keyword
);
String
fixKeyword
=
PinYinUtils
.
convertToSimplifiedChinese
(
keyword
);
...
@@ -244,6 +287,8 @@ public class SuggestTask {
...
@@ -244,6 +287,8 @@ public class SuggestTask {
}
}
}
}
ConcurrentHashMap
<
String
,
EsSuggestKeywordInfo
>
tmpSuggestKeywordMap
=
new
ConcurrentHashMap
<>();
// 过滤词
// 过滤词
List
<
EsSuggestKeywordInfo
>
suggestKeywordInfoList
=
new
ArrayList
<>();
List
<
EsSuggestKeywordInfo
>
suggestKeywordInfoList
=
new
ArrayList
<>();
int
processCount
=
0
;
int
processCount
=
0
;
...
@@ -261,6 +306,7 @@ public class SuggestTask {
...
@@ -261,6 +306,7 @@ public class SuggestTask {
cleanBeforeSaveToEs
(
suggestKeywordInfo
);
cleanBeforeSaveToEs
(
suggestKeywordInfo
);
suggestKeywordInfoList
.
add
(
suggestKeywordInfo
);
suggestKeywordInfoList
.
add
(
suggestKeywordInfo
);
tmpSuggestKeywordMap
.
put
(
suggestKeywordInfo
.
getKeyword
(),
suggestKeywordInfo
);
}
}
processCount
++;
processCount
++;
...
@@ -268,6 +314,8 @@ public class SuggestTask {
...
@@ -268,6 +314,8 @@ public class SuggestTask {
log
.
info
(
"keyword filter process: {} / {}"
,
processCount
,
totalCount
);
log
.
info
(
"keyword filter process: {} / {}"
,
processCount
,
totalCount
);
}
}
}
}
mergeKeywordTag
(
tmpSuggestKeywordMap
);
// 处理部分keyword,合并为其他词的tag
QueryPlanHelper
.
getInstance
().
writeQueryPlanToFile
();
if
(
"true"
.
equalsIgnoreCase
(
System
.
getProperty
(
"suggest.saveToFile"
)))
{
if
(
"true"
.
equalsIgnoreCase
(
System
.
getProperty
(
"suggest.saveToFile"
)))
{
// save to file
// save to file
...
@@ -278,6 +326,162 @@ public class SuggestTask {
...
@@ -278,6 +326,162 @@ public class SuggestTask {
}
}
}
}
/**
 * Gives short, still untagged keywords suggest tags derived from longer keywords that extend
 * them.
 *
 * <p>For every keyword whose byte length is in (3, 18] and that has no tags yet, the longer
 * keywords starting with it are collected, ranked by {@code getWordABRank()} (highest first,
 * missing ranks last) and the remainders of the best ones become the comma-separated tags of the
 * short keyword, at most three and in ranking order.
 *
 * @param esSuggestKeywordMap keyword to suggest entry; entries are updated in place
 */
private static void mergeKeywordTag(ConcurrentHashMap<String, EsSuggestKeywordInfo> esSuggestKeywordMap) {
    final int maxCandidates = 10000;
    final int maxTags = 3;

    List<String> keyList = new ArrayList<>(esSuggestKeywordMap.keySet());
    // Lexicographic order places every keyword that extends `word` directly behind `word`.
    Collections.sort(keyList);

    for (int i = 0; i < keyList.size(); i++) {
        String word = keyList.get(i);
        EsSuggestKeywordInfo suggestInfo = esSuggestKeywordMap.get(word);
        if (suggestInfo == null) {
            continue;
        }
        int wordLen = StringUtils.getByteLength(word);
        if (wordLen <= 3 || wordLen > 18 || !StringUtils.isBlank(suggestInfo.getSuggestTags())) {
            continue;
        }

        List<EsSuggestKeywordInfo> candidates =
                collectMergeCandidates(word, wordLen, i + 1, keyList, esSuggestKeywordMap, maxCandidates);
        if (candidates.isEmpty()) {
            continue;
        }
        Collections.sort(candidates, (left, right) -> compareByAbRankDesc(left, right));

        String tags = buildMergedTags(word, candidates, maxTags);
        if (!tags.isEmpty()) {
            suggestInfo.setSuggestTags(tags);
        }
    }
}

/**
 * Collects the untagged keywords that start with {@code word}, whose remainder has a byte length
 * in (3, 14] and that {@code isSkipMergeTag} does not reject; at most {@code maxCandidates}.
 */
private static List<EsSuggestKeywordInfo> collectMergeCandidates(
        String word,
        int wordLen,
        int fromIndex,
        List<String> sortedKeys,
        ConcurrentHashMap<String, EsSuggestKeywordInfo> esSuggestKeywordMap,
        int maxCandidates) {
    List<EsSuggestKeywordInfo> candidates = new ArrayList<>();
    for (int j = fromIndex; j < sortedKeys.size() && candidates.size() < maxCandidates; j++) {
        String fullWord = sortedKeys.get(j);
        if (!fullWord.startsWith(word)) {
            // The keys are sorted, so no later key can start with `word` either.
            break;
        }
        EsSuggestKeywordInfo extension = esSuggestKeywordMap.get(fullWord);
        if (extension == null || StringUtils.isNotBlank(extension.getSuggestTags())) {
            continue;
        }
        int rightLen = StringUtils.getByteLength(fullWord) - wordLen;
        if (rightLen > 3 && rightLen <= 14 && !isSkipMergeTag(word, fullWord)) {
            candidates.add(extension);
        }
    }
    return candidates;
}

/** Orders entries by {@code getWordABRank()} descending; entries without a rank go last. */
private static int compareByAbRankDesc(EsSuggestKeywordInfo left, EsSuggestKeywordInfo right) {
    Double leftRank = left.getWordABRank();
    Double rightRank = right.getWordABRank();
    if (leftRank != null && rightRank != null) {
        return rightRank.compareTo(leftRank);
    }
    if (leftRank == null && rightRank == null) {
        return 0;
    }
    return leftRank == null ? 1 : -1;
}

/**
 * Joins the remainders of the best-ranked candidates into a comma-separated tag string, skipping
 * remainders that {@code isSkipSynonymTag} rejects. Returns an empty string when none qualifies.
 */
private static String buildMergedTags(String word, List<EsSuggestKeywordInfo> rankedCandidates, int maxTags) {
    // Insertion-ordered set: the joined tags keep the ranking order of the candidates.
    Set<String> tags = new java.util.LinkedHashSet<>();
    int prefixLength = word.length();
    for (EsSuggestKeywordInfo candidate : rankedCandidates) {
        if (tags.size() >= maxTags) {
            break;
        }
        String subWord = candidate.getKeyword().substring(prefixLength).trim();
        if (!isSkipSynonymTag(subWord, tags)) {
            tags.add(subWord);
        }
    }
    return String.join(",", tags);
}
/**
 * Decides whether {@code subWord} should be left out of the tag set.
 *
 * @param subWord candidate tag
 * @param tagSet  tags accepted so far
 * @return {@code true} when {@code subWord} is blank, or when the first synonym group containing
 *         it already has a member in {@code tagSet}
 */
private static boolean isSkipSynonymTag(String subWord, Set<String> tagSet) {
    if (StringUtils.isBlank(subWord)) {
        return true;
    }
    for (Set<String> group : synonymList) {
        if (!group.contains(subWord)) {
            continue;
        }
        // Overlapping groups are not supported: only the first matching group is examined.
        for (String member : group) {
            if (tagSet.contains(member)) {
                return true;
            }
        }
        return false;
    }
    return false;
}
/**
 * Decides whether the part of {@code fullWord} that follows {@code word} must NOT be used as a
 * tag of {@code word}. The method indexes {@code fullWord} by the length of {@code word}; it does
 * not itself verify that {@code fullWord} starts with {@code word}.
 *
 * @param word     the short keyword
 * @param fullWord the longer keyword
 * @return {@code true} when the pair is rejected, {@code false} when the remainder may be a tag
 */
private static boolean isSkipMergeTag(String word, String fullWord) {
    if (StringUtils.isBlank(word) || StringUtils.isBlank(fullWord)) {
        return true;
    }
    final int prefixChars = word.length();
    final int fullChars = fullWord.length();
    if (fullChars <= prefixChars) {
        return true;
    }

    final boolean prefixIsEnglish = StringUtils.isEnAndMidSpaceStr(word);
    final int prefixBytes = StringUtils.getByteLength(word);
    if (prefixBytes <= 3 || prefixBytes > 18) {
        return true;
    }

    final String lastOfPrefix = word.substring(prefixChars - 1, prefixChars);
    final String firstOfRest = fullWord.substring(prefixChars, prefixChars + 1);
    final String rest = fullWord.substring(prefixChars, fullChars).trim();

    // Limit the length of what may become a tag.
    final int restBytes = StringUtils.getByteLength(rest);
    if (restBytes <= 3 || restBytes > 14) {
        return true;
    }
    if (ignoreWordSet.contains(rest)) {
        return true;
    }

    final boolean restIsTShirt = rest.length() >= 2 && rest.toLowerCase().startsWith("t恤");
    // An English prefix followed by an English letter is treated as one English word,
    // except when the remainder starts with "t恤".
    if (prefixIsEnglish && StringUtils.isEnStr(firstOfRest) && !restIsTShirt) {
        return true;
    }

    final String junction = lastOfPrefix + firstOfRest;
    // Digits on both sides of the split point: not a tag.
    if (StringUtils.isNumber(junction)) {
        return true;
    }
    // A remainder starting with "色" usually completes a colour word from the prefix.
    if (rest.startsWith("色")) {
        return true;
    }
    // The characters around the split point form a known word: not a tag.
    if (spWordSet.contains(junction)) {
        return true;
    }
    if (!StringUtils.isNotBlank(rest)) {
        return true;
    }

    log.info("check word:" + word + " and " + fullWord);
    QueryPlanHelper sqp = QueryPlanHelper.getInstance();
    int prefixWordCount = sqp.explainQueryWordCount(word);
    int restWordCount = sqp.explainQueryWordCount(rest);
    int fullWordCount = sqp.explainQueryWordCount(fullWord);
    // Reject when analysing the two parts separately yields more query words than the whole.
    return prefixWordCount + restWordCount > fullWordCount;
}
private
static
EsSuggestKeywordInfo
buildDefaultEsSuggestKeywordInfo
(
String
keyword
)
{
private
static
EsSuggestKeywordInfo
buildDefaultEsSuggestKeywordInfo
(
String
keyword
)
{
EsSuggestKeywordInfo
esSuggestKeywordInfo
=
new
EsSuggestKeywordInfo
();
EsSuggestKeywordInfo
esSuggestKeywordInfo
=
new
EsSuggestKeywordInfo
();
esSuggestKeywordInfo
.
setKeyword
(
keyword
);
esSuggestKeywordInfo
.
setKeyword
(
keyword
);
...
...
suggest-task/src/main/java/com/secoo/so/suggest/util/StringUtils.java
View file @
099f58c5
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
...
@@ -6,6 +6,7 @@ import com.alibaba.fastjson.JSONObject;
import
java.io.UnsupportedEncodingException
;
import
java.io.UnsupportedEncodingException
;
import
java.math.BigDecimal
;
import
java.math.BigDecimal
;
import
java.nio.channels.Pipe
;
import
java.nio.charset.Charset
;
import
java.nio.charset.Charset
;
import
java.nio.charset.StandardCharsets
;
import
java.nio.charset.StandardCharsets
;
import
java.security.MessageDigest
;
import
java.security.MessageDigest
;
...
@@ -2067,6 +2068,17 @@ public abstract class StringUtils {
...
@@ -2067,6 +2068,17 @@ public abstract class StringUtils {
}
}
/**
/**
 * Tells whether {@code word} is an English string: only ASCII letters and whitespace.
 * A string that starts or ends with a space is not English; spaces in the middle are allowed.
 *
 * @param word the text to check, may be {@code null}
 * @return {@code true} for an English string; {@code false} otherwise, including for
 *         {@code null} and the empty string
 */
public static boolean isEnAndMidSpaceStr(String word) {
    if (word == null || word.startsWith(" ") || word.endsWith(" ")) {
        return false;
    }
    return EN_AND_MID_SPACE_PATTERN.matcher(word.trim()).matches();
}

/** Letters and whitespace only; compiled once because the check runs for every keyword pair. */
private static final java.util.regex.Pattern EN_AND_MID_SPACE_PATTERN =
        java.util.regex.Pattern.compile("[a-zA-Z\\s]+");
/**
* 判断是否包含中文
* 判断是否包含中文
*/
*/
public
static
boolean
isContainChStr
(
String
word
)
{
public
static
boolean
isContainChStr
(
String
word
)
{
...
@@ -2109,6 +2121,20 @@ public abstract class StringUtils {
...
@@ -2109,6 +2121,20 @@ public abstract class StringUtils {
return
0
;
return
0
;
}
}
/**
 * Manual check of {@code isEnAndMidSpaceStr} and of cutting the remainder off a longer word.
 * Prints four lines: the two check results, the first character after the prefix and the whole
 * remainder.
 */
public static void main(String[] arg) {
    String word = "ab c ";
    String word1 = "ab c 中文";
    // substring works on char offsets, so the split point is the prefix's character count,
    // not its encoded byte length.
    int prefixChars = word.length();
    String aaa = word1.substring(prefixChars, prefixChars + 1);
    String bbb = word1.substring(prefixChars);
    System.out.println(isEnAndMidSpaceStr(word));
    System.out.println(isEnAndMidSpaceStr(bbb));
    System.out.println(aaa);
    System.out.println(bbb);
}
/**
/**
* 32位md5加密
* 32位md5加密
*/
*/
...
@@ -2153,7 +2179,7 @@ public abstract class StringUtils {
...
@@ -2153,7 +2179,7 @@ public abstract class StringUtils {
*
*
* <p>If the stripChars String is {@code null}, whitespace is
* <p>If the stripChars String is {@code null}, whitespace is
* stripped as defined by {@link Character#isWhitespace(char)}.
* stripped as defined by {@link Character#isWhitespace(char)}.
* Alternatively use {@link #strip(String)}.</p>
* Alternatively use {@link #strip(String
, String
)}.</p>
*
*
* <pre>
* <pre>
* StringUtils.strip(null, *) = null
* StringUtils.strip(null, *) = null
...
...
suggest-task/src/main/profiles/prod/config.properties
View file @
099f58c5
...
@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D
...
@@ -14,3 +14,7 @@ suggestTask.es.password=search5z0NvEn1D
suggestTask.es.index
=
search_suggest_index
suggestTask.es.index
=
search_suggest_index
suggestTask.es.type
=
search_suggest_type
suggestTask.es.type
=
search_suggest_type
suggestTask.es.batchSize
=
2000
suggestTask.es.batchSize
=
2000
queryPlan.cachePath
=
/data/crontab/suggest/tmp/queryplan.txt
specialWordPath
=
/data/crontab/suggest/dict/specialWord.txt
synonymTagPath
=
/data/crontab/suggest/dict/synonymTag.txt
suggest-task/src/main/profiles/test/config.properties
View file @
099f58c5
...
@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
...
@@ -5,11 +5,16 @@ suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
suggestTask.EuropeWordFolder
=
/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.EuropeWordFolder
=
/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.batchSize
=
10000
suggestTask.batchSize
=
10000
suggestTask.threadPoolSize
=
10
suggestTask.threadPoolSize
=
10
suggestTask.suggestTagMaxSize
=
5
suggestTask.searchWordWarningCount
=
1000000
suggestTask.searchWordWarningCount
=
1000000
suggestTask.es.url
=
http://10.0.254.139:9200
suggestTask.suggestTagMaxSize
=
5
suggestTask.es.user
=
suggest
suggestTask.warningPhones
=
13426233960
suggestTask.es.password
=
suggest456
suggestTask.es.url
=
http://bigdataescluster.secoolocal.com:9200
suggestTask.es.index
=
search_suggest_index
suggestTask.es.user
=
search
suggestTask.es.password
=
search5z0NvEn1D
suggestTask.es.index
=
search_suggest_index_huidu
suggestTask.es.type
=
search_suggest_type
suggestTask.es.type
=
search_suggest_type
suggestTask.es.batchSize
=
2000
suggestTask.es.batchSize
=
2000
\ No newline at end of file
queryPlan.cachePath
=
/data/crontab/test/tmp/queryplan.txt
specialWordPath
=
/data/crontab/test/dict/specialWord.txt
synonymTagPath
=
/data/crontab/test/dict/synonymTag.txt
\ No newline at end of file
suggest-task/src/main/profiles/test/db.properties
View file @
099f58c5
erp.read.url
=
jdbc:mysql://1
0.4.3.223
:3306/secooErpDB?useUnicode=true&characterEncoding=utf8&noAccessToProcedureBodies=true&zeroDateTimeBehavior=convertToNull&allowMultiQueries=true
erp.read.url
=
jdbc:mysql://1
92.168.50.40
:3306/secooErpDB?useUnicode=true&characterEncoding=utf8&noAccessToProcedureBodies=true&zeroDateTimeBehavior=convertToNull&allowMultiQueries=true
erp.read.user
=
3306_test
erp.read.user
=
so_Erp_R
erp.read.password
=
iS6CXpYqgZ8Mhjui
erp.read.password
=
5RgzudyyFlApTmve
seo.read.url
=
jdbc:mysql://10.4.3.223:3306/secooSeoDB?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull
seo.read.url
=
jdbc:mysql://secooSeoDB.master.com:3307/secooSeoDB?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull
seo.read.user
=
SeoDB_test
seo.read.user
=
sem_Seo_W
seo.read.password
=
Cxkfq57huej0fTpK
seo.read.password
=
C2IiHfNKYpT1onsR
\ No newline at end of file
dw.read.url
=
jdbc:mysql://secooDataWarehouse.slave.com:3306/secooDataWarehouse?useUnicode=true&characterEncoding=utf8&zeroDateTimeBehavior=convertToNull
dw.read.user
=
Search_DataWar_R
dw.read.password
=
pY1P9zUj9x1M65ot5szo
\ No newline at end of file
suggest-task/src/main/resources/specialWord示例.txt
0 → 100644
View file @
099f58c5
手提
手提
提包
\ No newline at end of file
suggest-task/src/main/resources/synonymTag示例.txt
0 → 100644
View file @
099f58c5
皮夹|钱包
皮夹|钱包
围脖|围巾
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment