Commit af4c8ed7 by zhaoyanchao

更改品牌品类优先,避免误识别为无结果词

parent ed50608a
...@@ -70,6 +70,8 @@ var t2s, _ = gocc.New("t2s") ...@@ -70,6 +70,8 @@ var t2s, _ = gocc.New("t2s")
var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana", var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana",
"dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii","tod's","iwc7" } "dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii","tod's","iwc7" }
const TABLE_SPLIT_STEP_SIZE = 10000 const TABLE_SPLIT_STEP_SIZE = 10000
const MAX_TAG_SIZE = 5 const MAX_TAG_SIZE = 5
...@@ -78,6 +80,7 @@ var UPDATE_TIME = time.Now().UnixNano() / 1e6 ...@@ -78,6 +80,7 @@ var UPDATE_TIME = time.Now().UnixNano() / 1e6
func main() { func main() {
startTime := time.Now() startTime := time.Now()
log.SetFlags(log.Lshortfile | log.LstdFlags)
datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB) datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB)
...@@ -180,6 +183,7 @@ func cleanForEs(w *Word) { ...@@ -180,6 +183,7 @@ func cleanForEs(w *Word) {
func addWord(w *Word, processor *elastic.BulkProcessor) { func addWord(w *Word, processor *elastic.BulkProcessor) {
processWord(w) processWord(w)
if !isFilterWord(w) { if !isFilterWord(w) {
wordMap.Store(w.Keyword,w) wordMap.Store(w.Keyword,w)
cleanForEs(w) cleanForEs(w)
...@@ -305,7 +309,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg ...@@ -305,7 +309,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
results, err := db.Query(sqlStr) results, err := db.Query(sqlStr)
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
log.Print("read database success ") log.Print("read database success ")
for results.Next() { for results.Next() {
...@@ -472,25 +475,41 @@ func processWord(w *Word) { ...@@ -472,25 +475,41 @@ func processWord(w *Word) {
func isFilterWord(w *Word) bool { func isFilterWord(w *Word) bool {
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual {
w.IsSensitive = false
return false
}
// 敏感词过滤 // 敏感词过滤
if w.IsSensitive { return true } if w.IsSensitive {
return true
}
// 过滤掉太长的词 每个中文字占3个byte // 过滤掉太长的词 每个中文字占3个byte
if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 50 { return true } if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 50 {
return true
}
// 过滤掉商品id,商品id是有7位数字组成 // 过滤掉商品id,商品id是有7位数字组成
if len(w.Keyword) > 6 && isAllDigit(w.Keyword) { return true } if len(w.Keyword) > 6 && isAllDigit(w.Keyword) {
return true
}
// 前缀过滤 // 前缀过滤
for _, v := range prefixFilterArr { for _, v := range prefixFilterArr {
if strings.HasPrefix(w.Keyword, v) { return true } if strings.HasPrefix(w.Keyword, v) {
return true
}
} }
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual { return false }
// 年数据过滤 // 年数据过滤
if w.YearCount < 2 || w.YearClickCount < 2 { return true } if w.YearCount < 2 || w.YearClickCount < 2 {
return true
}
// 判断是否是热搜词 一年内搜索次数大于50或者一周内搜索次数大于5 // 判断是否是热搜词 一年内搜索次数大于50或者一周内搜索次数大于5
if isHotSearchWord(w) { if isHotSearchWord(w) {
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment