Unverified Commit 6abf6e24 by David Star Committed by GitHub

Merge pull request #2 from yanchaosb123/rank_opt

敏感词放后
parents bca1a519 af4c8ed7
...@@ -70,9 +70,9 @@ var t2s, _ = gocc.New("t2s") ...@@ -70,9 +70,9 @@ var t2s, _ = gocc.New("t2s")
var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana", var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana",
"dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii","tod's","iwc7" } "dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii","tod's","iwc7" }
const TABLE_SPLIT_STEP_SIZE = 10000
const LEVEL_SIZE = 1
const TABLE_SPLIT_STEP_SIZE = 10000
const MAX_TAG_SIZE = 5 const MAX_TAG_SIZE = 5
...@@ -80,6 +80,7 @@ var UPDATE_TIME = time.Now().UnixNano() / 1e6 ...@@ -80,6 +80,7 @@ var UPDATE_TIME = time.Now().UnixNano() / 1e6
func main() { func main() {
startTime := time.Now() startTime := time.Now()
log.SetFlags(log.Lshortfile | log.LstdFlags)
datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB) datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB)
...@@ -117,7 +118,7 @@ func main() { ...@@ -117,7 +118,7 @@ func main() {
count := arr[1] / TABLE_SPLIT_STEP_SIZE count := arr[1] / TABLE_SPLIT_STEP_SIZE
log.Printf("maxId/10000=%d\n", count) log.Printf("maxId/10000=%d\n", count)
if arr[1] < 2800000 { if arr[1] < 1000000 {
log.Printf("data is too little ,return") log.Printf("data is too little ,return")
sendSuggestNotify() sendSuggestNotify()
return return
...@@ -182,6 +183,7 @@ func cleanForEs(w *Word) { ...@@ -182,6 +183,7 @@ func cleanForEs(w *Word) {
func addWord(w *Word, processor *elastic.BulkProcessor) { func addWord(w *Word, processor *elastic.BulkProcessor) {
processWord(w) processWord(w)
if !isFilterWord(w) { if !isFilterWord(w) {
wordMap.Store(w.Keyword,w) wordMap.Store(w.Keyword,w)
cleanForEs(w) cleanForEs(w)
...@@ -307,7 +309,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg ...@@ -307,7 +309,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
results, err := db.Query(sqlStr) results, err := db.Query(sqlStr)
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
log.Print("read database success ") log.Print("read database success ")
for results.Next() { for results.Next() {
...@@ -467,30 +468,48 @@ func processWord(w *Word) { ...@@ -467,30 +468,48 @@ func processWord(w *Word) {
calculateWordRank(w) calculateWordRank(w)
calculateWordABRank(w) calculateWordABRank(w)
addNewScoreIfNewHotWord(w)
} }
func isFilterWord(w *Word) bool { func isFilterWord(w *Word) bool {
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual {
w.IsSensitive = false
return false
}
// 敏感词过滤 // 敏感词过滤
if w.IsSensitive { return true } if w.IsSensitive {
return true
}
// 过滤掉太长的词 每个中文字占3个byte // 过滤掉太长的词 每个中文字占3个byte
if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 50 { return true } if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 50 {
return true
}
// 过滤掉商品id,商品id是有7位数字组成 // 过滤掉商品id,商品id是有7位数字组成
if len(w.Keyword) > 6 && isAllDigit(w.Keyword) { return true } if len(w.Keyword) > 6 && isAllDigit(w.Keyword) {
return true
}
// 前缀过滤 // 前缀过滤
for _, v := range prefixFilterArr { for _, v := range prefixFilterArr {
if strings.HasPrefix(w.Keyword, v) { return true } if strings.HasPrefix(w.Keyword, v) {
return true
}
} }
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual { return false }
// 年数据过滤 // 年数据过滤
if w.YearCount < 2 || w.YearClickCount < 2 { return true } if w.YearCount < 2 || w.YearClickCount < 2 {
return true
}
// 判断是否是热搜词 一年内搜索次数大于50或者一周内搜索次数大于5 // 判断是否是热搜词 一年内搜索次数大于50或者一周内搜索次数大于5
if isHotSearchWord(w) { if isHotSearchWord(w) {
...@@ -502,6 +521,29 @@ func isFilterWord(w *Word) bool { ...@@ -502,6 +521,29 @@ func isFilterWord(w *Word) bool {
} }
} }
// addNewScoreIfNewHotWord multiplies w.WordABRank by sqrt(5) when w looks like
// a newly trending keyword, i.e. a large share of its yearly searches happened
// in the current week. It does nothing when w is nil.
//
// A word is boosted only when all of the following hold:
//   - WeekCount is at least 20 and YearCount is non-zero;
//   - WeekCount*10/YearCount is greater than 5;
//   - WeekClickCount is at least 3 and WeekUv is at least 5.
func addNewScoreIfNewHotWord(w *Word) {
	if w == nil {
		return
	}
	// The week/year ratio is only meaningful with enough weekly searches and
	// a non-zero divisor. WeekCount < 20 already covers WeekCount == 0.
	if w.WeekCount < 20 || w.YearCount == 0 {
		return
	}
	// Require the weekly searches to dominate the yearly searches.
	// NOTE(review): the previous comment said "weekly clicks are over 40% of
	// yearly clicks", but the check uses search counts and a threshold of 5
	// out of 10; assuming integer counts, the truncating division makes this
	// effectively "at least 60%". Confirm which threshold is intended.
	if w.WeekCount*10/w.YearCount <= 5 {
		return
	}
	if w.WeekClickCount < 3 || w.WeekUv < 5 {
		return
	}
	// The boost for a new hot word is sized similarly to the manual-intervention value.
	w.WordABRank = w.WordABRank * math.Sqrt(5.0)
	// Use the log package like the rest of the file: each message gets its
	// own line and the configured prefix flags.
	log.Printf("最新热词添加分数,新词: %s", w.Keyword)
}
func isAllDigit(str string) bool { func isAllDigit(str string) bool {
for _, x := range str { for _, x := range str {
// x 的类型是 rune 其实就是对应字符的 utf8 编码 // x 的类型是 rune 其实就是对应字符的 utf8 编码
......
package main package main
import ( import "fmt"
"math"
"strings" type Phone interface {
"fmt" call()
)
type B struct {
Keyword string `json:"keyword"`
KeywordPinYin string `json:"keywordPinYin"`
YearCount int32 `json:"yearCount"`
YearClickCount int32 `json:"yearClickCount"`
YearCartCount int32 `json:"yearCartCount"`
ZhaoCount int32 `json:"-"`
} }
func main() {
prefix := strings.HasPrefix("tod's", "tod's") type NokiaPhone struct {
fmt.Print(prefix) Name string
}
func (nokiaPhone *NokiaPhone) call() {
fmt.Print(nokiaPhone.Name)
} }
//
//func (nokiaPhone *NokiaPhone) call() {
// fmt.Print(nokiaPhone.Name)
//}
func main() {
func calculateRatioFactor2(ratio float64, count int32) float64 {
var rank float64 var phone = NokiaPhone{Name:"zhangsan"}
switch { phone.call()
case count > 1 && count < 10 : rank = 1.2
case count >= 10 && count < 20 : rank = 1.4
case count >= 20 && count < 50 : rank = 1.6
case count >= 50 && count < 100 : rank = 1.8
case count >= 100 && count < 200 : rank = 2.0
case count >= 200 && count < 500 : rank = 2.2
case count >= 500 : rank = 2.5
default:rank = 1.0
}
//根据搜索转化率,转换为热度因子
return math.Log10(math.Sqrt(ratio + 10)) * rank
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment