Unverified Commit bca1a519 by David Star Committed by GitHub

Merge pull request #1 from yanchaosb123/rank_opt

算分优化
parents a59ff1f5 33b7d636
package main
import (
"strings"
"net/http"
"time"
"encoding/json"
"bytes"
"io/ioutil"
"container/list"
"strings"
"github.com/mozillazg/go-pinyin"
"fmt"
"strconv"
"fmt"
)
type ENV struct {
......@@ -16,6 +22,13 @@ type ENV struct {
ManualFolder string
SensitiveFolder string
}
// Message is the payload that sendSuggestNotify hands to Post, which
// JSON-encodes it as the request body.
//
// Phones and Body remain *list.List so existing callers keep compiling.
// container/list.List has no exported fields, so the default encoder would
// emit "{}" for both and drop their contents; MarshalJSON flattens them into
// JSON arrays instead.
// NOTE(review): presumably the receiving service expects arrays here — confirm
// against its API.
type Message struct {
	Title  string     // headline of the notification
	Phones *list.List // recipient entries, in insertion order
	Body   *list.List // message lines, in insertion order
}

// MarshalJSON encodes the message under the same field names the default
// encoder would use (Title, Phones, Body), with both lists written as arrays.
// A nil list is still encoded as null.
func (m Message) MarshalJSON() ([]byte, error) {
	return json.Marshal(struct {
		Title  string
		Phones []interface{}
		Body   []interface{}
	}{
		Title:  m.Title,
		Phones: listValues(m.Phones),
		Body:   listValues(m.Body),
	})
}

// listValues copies the elements of l, front to back, into a slice.
// It returns nil for a nil list and an empty slice for an empty list.
func listValues(l *list.List) []interface{} {
	if l == nil {
		return nil
	}
	values := make([]interface{}, 0, l.Len())
	for e := l.Front(); e != nil; e = e.Next() {
		values = append(values, e.Value)
	}
	return values
}
var test_env = &ENV{
DataWareDB: "DataWarehouse_test:FihdZW7o1XKtDETZexOG@tcp(test01-secooDataWarehouse.master.com:3306)/secooDataWarehouse",
ErpDB: "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB",
......@@ -37,12 +50,28 @@ var prod_env = &ENV {
SensitiveFolder: "/data/pssmaster/corpus_set/suggest_corpus/sensitive"}
// Important: this setting decides whether the production or the test environment is used.
var RUN_ENV = test_env
var RUN_ENV = prod_env
/************************* utility functions below *****************************/
// CH_EN_PUNC maps Chinese / full-width punctuation marks to their ASCII
// counterparts. DBC2SBC looks each rune up here before falling back to
// code-point arithmetic.
//
// The keys for the comma, exclamation mark, question mark and parentheses are
// the full-width forms (U+FF0C, U+FF01, U+FF1F, U+FF08, U+FF09); an ASCII key
// mapped to itself would be a no-op entry.
var CH_EN_PUNC = map[string]string{
	"，": ",",
	"。": ".",
	"！": "!",
	"？": "?",
	"【": "[",
	"】": "]",
	"（": "(",
	"）": ")",
	"‘": "'",
	"’": "'",
	"“": "\"",
	"”": "\"",
}
func convertToPinyin(str string) string {
var ret string
for _, v := range str {
......@@ -62,7 +91,7 @@ func convertToPinyin(str string) string {
// cleanKeyword normalizes a raw search keyword so that equivalent spellings
// collapse to one key.
//
// Steps, in order:
//  1. run the keyword through the package-level t2s converter (a conversion
//     error is printed and processing continues with whatever it returned);
//  2. trim surrounding whitespace and convert full-width characters and
//     Chinese punctuation with DBC2SBC;
//  3. strip leading/trailing U+FFFC, '|', ',' and '.' characters;
//  4. lower-case the result and collapse every run of whitespace to one space.
func cleanKeyword(keyword string) string {
	out, err := t2s.Convert(keyword)
	if err != nil {
		fmt.Println(err)
	}
	// strings.Trim takes a set of characters, not a '|'-separated list of
	// alternatives, so '|' itself is also stripped from both ends.
	trimmed := strings.Trim(DBC2SBC(strings.TrimSpace(out)), "\ufffc|,|.")
	keyword = strings.ToLower(trimmed)
	return strings.Join(strings.Fields(keyword), " ")
}
......@@ -78,16 +107,58 @@ func DBC2SBC(s string) string {
var strLst []string
for _, i := range s {
insideCode := i
if insideCode == 12288 {
insideCode = 32
} else {
insideCode -= 65248
}
if insideCode < 32 || insideCode > 126 {
if key,exist := CH_EN_PUNC[string(i)]; exist {
strLst = append(strLst, key)
} else if insideCode < 32 || insideCode > 126 {
strLst = append(strLst, string(i))
} else {
strLst = append(strLst, string(insideCode))
}
}
return strings.Join(strLst, "")
}
\ No newline at end of file
}
// Post JSON-encodes data, sends it as the body of an HTTP POST request and
// returns the response body as a string.
//
//   - url: address the request is sent to
//   - data: value serialized with encoding/json to form the request body
//   - contentType: Content-Type of the request body, e.g. application/json
//
// The whole request is bounded by a 5-second timeout. If data cannot be
// encoded, the error is printed and "" is returned without contacting the
// server. A failure to send the request panics, as before. If reading the
// response fails part-way, the error is printed and the bytes read so far are
// returned.
func Post(url string, data interface{}, contentType string) string {
	payload, err := json.Marshal(data)
	if err != nil {
		// Sending an empty body would hide the problem from the receiver.
		fmt.Println(err)
		return ""
	}

	// Timeout: 5 seconds.
	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Post(url, contentType, bytes.NewBuffer(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	result, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
	}
	return string(result)
}
// sendSuggestNotify posts a fixed alert about abnormal (too little) suggest
// data to the notification endpoint. The response is ignored.
func sendSuggestNotify() {
	// Recipients are pushed as one comma-separated entry.
	recipients := list.New()
	recipients.PushBack("17621863255,13894895183")

	lines := list.New()
	lines.PushBack("提示词数据太少")

	alert := Message{
		Title:  "提示词数据异常",
		Phones: recipients,
		Body:   lines,
	}
	Post("http://matrix-inform.secoolocal.com/user/sendToUser", alert, "application/json")
}
......@@ -3,8 +3,7 @@ package main
import (
"bufio"
"context"
"crypto/md5"
"database/sql"
"database/sql"
"fmt"
_ "github.com/go-sql-driver/mysql"
"github.com/liuzl/gocc"
......@@ -19,7 +18,9 @@ import (
"time"
"unicode"
"unicode/utf8"
)
"crypto/md5"
)
type Word struct {
Keyword string `json:"keyword"`
......@@ -42,6 +43,18 @@ type Word struct {
WordRank float64 `json:"wordRank"`
WordABRank float64 `json:"wordABRank"`
KeywordVersion string `json:"keywordVersion"`
WeekUv int32 `json:"-"`
WeekClickUv int32 `json:"-"`
WeekAddCartUv int32 `json:"-"`
MonthPv int32 `json:"-"`
MonthClickCount int32 `json:"-"`
MonthAddCartCount int32 `json:"-"`
MonthUv int32 `json:"-"`
MonthProductClickUv int32 `json:"-"`
MonthAddCartUv int32 `json:"-"`
SuggestTags string `json:"suggestTags"`
UpdateTime int64 `json:"updateTime"`
}
var wordMap sync.Map
......@@ -55,18 +68,25 @@ var dateStr = fmt.Sprintf("%d-%02d-%02d",now.Year(),now.Month(),now.Day())
var t2s, _ = gocc.New("t2s")
var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana",
"dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii"}
"dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii","tod's","iwc7" }
// TABLE_SPLIT_STEP_SIZE is the width of the id window read by one queryIndex
// call: each call selects rows with idFlag <= id < idFlag + TABLE_SPLIT_STEP_SIZE.
const TABLE_SPLIT_STEP_SIZE = 10000
// LEVEL_SIZE — NOTE(review): no use of this constant is visible in this part
// of the file; confirm its purpose before changing it.
const LEVEL_SIZE = 1
// MAX_TAG_SIZE is the maximum number of comma-separated tags that cleanForEs
// keeps in Word.SuggestTags.
const MAX_TAG_SIZE = 5
// UPDATE_TIME is the time at package initialization, in milliseconds since the
// Unix epoch; queryIndex copies it into Word.UpdateTime.
var UPDATE_TIME = time.Now().UnixNano() / 1e6
func main() {
startTime := time.Now()
datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB)
if err != nil { log.Print(err.Error()) }
datawareDB.SetConnMaxLifetime(10*time.Minute)
datawareDB.SetMaxOpenConns(50)
datawareDB.SetMaxIdleConns(50)
datawareDB.SetMaxOpenConns(350)
datawareDB.SetMaxIdleConns(100)
var client *elastic.Client
if RUN_ENV.EsUser != "" {
......@@ -78,8 +98,11 @@ func main() {
bulkProcessor, err := elastic.NewBulkProcessorService(client).
Workers(50).
BulkActions(5000).
BulkActions(2000).
FlushInterval(500*time.Millisecond).
Backoff( elastic.NewExponentialBackoff(
time.Duration(10000)*time.Millisecond,
time.Duration(100000)*time.Millisecond) ).
After(after).
Do(context.Background())
if err != nil { log.Print(err.Error()) }
......@@ -94,12 +117,21 @@ func main() {
count := arr[1] / TABLE_SPLIT_STEP_SIZE
log.Printf("maxId/10000=%d\n", count)
for i := 0; i <= count; i++ {
go queryIndex(i * TABLE_SPLIT_STEP_SIZE, datawareDB, bulkProcessor, &wg)
if arr[1] < 2800000 {
log.Printf("data is too little ,return")
sendSuggestNotify()
return
}
for j := 0; j < count; j++ {
wg.Add(1)
go queryIndex( j *TABLE_SPLIT_STEP_SIZE, datawareDB, bulkProcessor, &wg)
}
wg.Wait()
fmt.Println("all thread has read maps")
checkUnusedData(bulkProcessor)
err = bulkProcessor.Flush()
......@@ -130,10 +162,29 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
}
}
// cleanForEs tidies a word's fields before it is written to ES.
//
// A SuggestTags value that is the literal text "null" or "NULL" is cleared.
// Otherwise the value is split on "," and only the first MAX_TAG_SIZE tags
// are kept, re-joined with ",".
func cleanForEs(w *Word) {
	switch w.SuggestTags {
	case "null", "NULL":
		w.SuggestTags = ""
		return
	}

	tags := strings.Split(w.SuggestTags, ",")
	if len(tags) > MAX_TAG_SIZE {
		tags = tags[:MAX_TAG_SIZE]
	}
	w.SuggestTags = strings.Join(tags, ",")
}
func addWord(w *Word, processor *elastic.BulkProcessor) {
processWord(w)
if !isFilterWord(w) {
wordMap.Store(w.Keyword,w)
cleanForEs(w)
id := fmt.Sprintf("%x", md5.Sum([]byte(w.Keyword)))
req := elastic.NewBulkIndexRequest().
Index("search_suggest_index").
......@@ -149,17 +200,25 @@ func loadErpDB() {
if err != nil { log.Print(err.Error()) }
defer db.Close()
var brandQuery = fmt.Sprintf("select id,en_name,ch_name from secooErpDB.t_product_brand where is_del = 0 and enabled = 1")
var brandQuery = fmt.Sprintf("select id,en_name,ch_name,short_name,nickname from secooErpDB.t_product_brand where is_del = 0 and enabled = 1")
brandResults, err := db.Query(brandQuery)
if err != nil { panic(err.Error()) }
for brandResults.Next() {
var id int
var enName string
var chName string
err = brandResults.Scan(&id, &enName, &chName)
var shortName sql.NullString
var nickName sql.NullString
err = brandResults.Scan(&id, &enName, &chName,&shortName,&nickName)
if err != nil { panic(err.Error()) }
brandMap[cleanKeyword(enName)] = id
brandMap[cleanKeyword(chName)] = id
if _,exist := brandMap[cleanKeyword(shortName.String)]; !exist {
brandMap[cleanKeyword(shortName.String)] = id
}
if _,exist := brandMap[cleanKeyword(nickName.String)]; !exist {
brandMap[cleanKeyword(nickName.String)] = id
}
}
fmt.Println("brandMap size is :", len(brandMap), ", brandMap is ", brandMap)
......@@ -236,13 +295,21 @@ func queryInfo(db *sql.DB) []int {
}
func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg *sync.WaitGroup) {
wg.Add(1)
// 循环时可能查询到重复数据,应该以id 的上下界来查询
var sqlStr = fmt.Sprintf("select id, keyword, year_pv, year_product_click_count, year_add_cart_count, " +
"week_pv, week_product_click_count, week_add_cart_count, p_day from app_search_keyword_year_week_p_day where id >= %d and id < %d", idFlag, idFlag + TABLE_SPLIT_STEP_SIZE)
"week_pv, week_product_click_count, week_add_cart_count, p_day, " +
"week_uv, week_product_click_uv, week_add_cart_uv, " +
"month_pv, month_product_click_count, month_add_cart_count, month_uv, month_product_click_uv, month_add_cart_uv, prepare_tags " +
"from app_search_keyword_year_week_p_day where id >= %d and id < %d", idFlag, idFlag + TABLE_SPLIT_STEP_SIZE)
log.Print(sqlStr)
results, err := db.Query(sqlStr)
if err != nil { log.Print(err.Error()) }
log.Print("read database success ")
for results.Next() {
var id int
var keyword sql.NullString
......@@ -253,8 +320,37 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
var weekProductClickCount sql.NullInt64
var weekAddCartCount sql.NullInt64
var pDay string
var weekUv sql.NullInt64
var weekClickUv sql.NullInt64
var weekAddCartUv sql.NullInt64
var monthPv sql.NullInt64
var monthClickCount sql.NullInt64
var monthAddCartCount sql.NullInt64
var monthUv sql.NullInt64
var monthProductClickUv sql.NullInt64
var monthAddCartUv sql.NullInt64
var prepareTags sql.NullString
err = results.Scan(&id,
&keyword,
&yearPv,
&yearProductClickCount,
&yearAddCartCount,
&weekPv,
&weekProductClickCount,
&weekAddCartCount,
&pDay,
&weekUv,
&weekClickUv,
&weekAddCartUv,
&monthPv,
&monthClickCount,
&monthAddCartCount,
&monthUv,
&monthProductClickUv,
&monthAddCartUv,
&prepareTags)
err = results.Scan(&id, &keyword, &yearPv, &yearProductClickCount, &yearAddCartCount, &weekPv, &weekProductClickCount, &weekAddCartCount, &pDay)
if err != nil { log.Print(err.Error()) }
if keyword.Valid && len(keyword.String) > 0 && keyword.String != "" {
......@@ -268,7 +364,18 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
WeekCount: int32(weekPv.Int64),
WeekClickCount: int32(weekProductClickCount.Int64),
WeekCartCount: int32(weekAddCartCount.Int64),
KeywordVersion:pDay}
KeywordVersion:pDay,
WeekUv: int32(weekUv.Int64),
WeekClickUv: int32(weekClickUv.Int64),
WeekAddCartUv: int32(weekAddCartUv.Int64),
MonthPv: int32(monthPv.Int64),
MonthClickCount: int32(monthClickCount.Int64),
MonthAddCartCount: int32(monthAddCartCount.Int64),
MonthUv: int32(monthUv.Int64),
MonthProductClickUv: int32(monthProductClickUv.Int64),
MonthAddCartUv: int32(monthAddCartUv.Int64) ,
SuggestTags: prepareTags.String,
UpdateTime: UPDATE_TIME}
if v, isExist := wordMap.Load(key); isExist {
merge(w,v)
......@@ -296,6 +403,19 @@ func merge(word *Word, v interface{}) {
word.WeekCount += t.WeekCount
word.WeekCartCount += t.WeekCartCount
word.WeekClickCount += t.WeekClickCount
word.WeekUv += t.WeekUv
word.WeekClickUv += t.WeekClickUv
word.WeekAddCartUv += t.WeekAddCartUv
word.MonthPv += t.MonthPv
word.MonthClickCount += t.MonthClickCount
word.MonthAddCartCount += t.MonthAddCartCount
word.MonthUv += t.MonthUv
word.MonthProductClickUv += t.MonthProductClickUv
word.MonthAddCartUv += t.MonthAddCartUv
if len(word.SuggestTags) == 0 || "null" == word.SuggestTags || "NULL" == word.SuggestTags {
word.SuggestTags = t.SuggestTags
}
}
func after(executionId int64, requests []elastic.BulkableRequest, response *elastic.BulkResponse, err error) {
......@@ -304,22 +424,26 @@ func after(executionId int64, requests []elastic.BulkableRequest, response *elas
func processWord(w *Word) {
w.KeywordPinYin = convertToPinyin(w.Keyword)
// 年点击加购率
w.YearClickRatio = calculateRatio(w.YearClickCount, w.YearCount)
w.YearCartRatio = calculateRatio(w.YearCartCount, w.YearCount)
// 周点击加购率
w.WeekClickRatio = calculateRatio(w.WeekClickCount, w.WeekCount)
w.WeekCartRatio = calculateRatio(w.WeekCartCount, w.WeekCount)
// 非默认值,加权
// 年加购率 再加权
if w.YearCount != 0 && w.YearCartCount != 0 {
w.YearCartRatio *= 3
}
// 非默认值,加权
// 周加购率 再加权
if w.WeekCount != 0 && w.WeekCartCount != 0 {
w.WeekCartRatio *= 3
}
// 非默认值,加权
// 周点击率 再加权
if w.WeekCount != 0 && w.WeekClickCount != 0 {
w.WeekClickRatio *= 2
}
......@@ -352,22 +476,22 @@ func isFilterWord(w *Word) bool {
if w.IsSensitive { return true }
// 过滤掉太长的词 每个中文字占3个byte
if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 60 { return true }
if utf8.RuneCountInString(w.Keyword) <= 1 || len(w.Keyword) > 50 { return true }
// 过滤掉商品id,商品id是有7位数字组成
if len(w.Keyword) > 6 && isAllDigit(w.Keyword) { return true }
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual { return false }
// 年数据过滤
if w.YearCount == 0 || w.YearClickCount == 0 { return true }
// 前缀过滤
for _, v := range prefixFilterArr {
if strings.HasPrefix(w.Keyword, v) { return true }
}
// 品牌词 类目词 人工干预词 不做过滤
if w.IsBrand || w.IsCategory || w.IsManual { return false }
// 年数据过滤
if w.YearCount < 2 || w.YearClickCount < 2 { return true }
// 判断是否是热搜词 一年内搜索次数大于50或者一周内搜索次数大于5
if isHotSearchWord(w) {
// 搜索次数比较多 转化率或者点击率较高的 不过滤
......@@ -405,14 +529,22 @@ func calculateRatio(numerator int32, denominator int32) float64 {
}
func calculateWordRank(w *Word) {
wordRank := 10000.0
// 长度因子
wordRank += 3000 * calculateLengthFactor(len(w.Keyword))
// 年数量因子
wordRank += 2000 * calculateCountFactor(w.YearCount, 1)
// 周数量因子
wordRank += 2000 * calculateCountFactor(w.WeekCount, 52)
// 年点击率因子
wordRank += 3000 * calculateRatioFactor(w.YearClickRatio, w.YearClickCount)
// 周点击率因子
wordRank += 3000 * calculateRatioFactor(w.WeekClickRatio, w.WeekClickCount)
// 年加购率因子
wordRank += 3000 * calculateRatioFactor(w.YearCartRatio, w.YearCartCount)
// 周加购率因子
wordRank += 3000 * calculateRatioFactor(w.WeekCartRatio, w.WeekCartCount)
if w.IsBrand { wordRank *= 1.8 }
if w.IsCategory { wordRank *= 1.2 }
......@@ -421,19 +553,60 @@ func calculateWordRank(w *Word) {
}
func calculateWordABRank(w *Word) {
// 月点击加购率
monthClickRatio := calculateRatio(w.MonthProductClickUv, w.MonthUv)
monthCartRatio := calculateRatio(w.MonthAddCartUv, w.MonthUv)
// 周点击加购率(和A相比, count 换成了uv)
weekClickRatioNew := calculateRatio(w.WeekClickUv, w.WeekUv)
weekCartRatioNew := calculateRatio(w.WeekAddCartUv, w.WeekUv)
// 月点击
if w.MonthProductClickUv != 0 && w.MonthUv != 0 {
monthClickRatio *= 1.5
}
// 月加购,加权
if w.MonthAddCartUv != 0 && w.MonthUv != 0 {
monthCartRatio *= 3
}
// 周点击,加权
if w.WeekClickUv != 0 && w.WeekUv != 0 {
weekClickRatioNew *= 2
}
// 周加购,加权
if w.WeekAddCartUv != 0 && w.WeekUv != 0 {
weekCartRatioNew *= 3
}
wordABRank := 10000.0
// 长度因子
wordABRank += 3000 * calculateLengthFactor(len(w.Keyword))
// 月数量因子
wordABRank += 2000 * calculateCountFactor(w.MonthUv, 4)
// 周数量因子
wordABRank += 2000 * calculateCountFactor(w.WeekUv, 52)
// 年数量因子
wordABRank += 2000 * calculateCountFactor(w.YearCount, 1)
wordABRank += 2000 * calculateCountFactor(w.WeekCount, 52)
// 点击
// 年点击改为 2000
wordABRank += 2000 * calculateRatioFactor(w.YearClickRatio, w.YearClickCount)
wordABRank += 3000 * calculateRatioFactor(w.WeekClickRatio, w.WeekClickCount)
// 月点击率因子
wordABRank += 3000 * calculateRatioFactor(monthClickRatio, w.MonthProductClickUv)
// 周点击率因子
wordABRank += 3000 * calculateRatioFactor(weekClickRatioNew, w.WeekUv)
// 加购
// 年加购率因子
wordABRank += 3000 * calculateRatioFactor(w.YearCartRatio, w.YearCartCount)
wordABRank += 3000 * calculateRatioFactor(w.WeekCartRatio, w.WeekCartCount)
// 月加购率因子
wordABRank += 3000 * calculateRatioFactor(monthCartRatio, w.MonthUv)
// 周加购率因子
wordABRank += 3000 * calculateRatioFactor(weekCartRatioNew, w.WeekUv)
if w.IsBrand { wordABRank *= 1.8 }
if w.IsCategory { wordABRank *= 1.2 }
......
package main
import (
"encoding/json"
"math"
"strings"
"fmt"
)
......@@ -13,20 +14,29 @@ type B struct {
YearCartCount int32 `json:"yearCartCount"`
ZhaoCount int32 `json:"-"`
}
func main() {
b := B{
Keyword: "赵延超",
KeywordPinYin: "zhaoyanchao",
YearCount: 1000,
YearCartCount: 100,
YearClickCount: 10,
ZhaoCount: 2}
if jsonBytes,errs := json.Marshal(b); errs == nil {
fmt.Print(string(jsonBytes))
prefix := strings.HasPrefix("tod's", "tod's")
fmt.Print(prefix)
}
// calculateRatioFactor2 turns a conversion ratio into a heat factor, scaled by
// a weight that grows with the number of observations behind the ratio.
//
// The weight is 1.0 for count <= 1 and steps up through 1.2, 1.4, 1.6, 1.8,
// 2.0 and 2.2 at counts of 2, 10, 20, 50, 100 and 200, reaching 2.5 at 500 and
// above. The result is log10(sqrt(ratio + 10)) * weight.
func calculateRatioFactor2(ratio float64, count int32) float64 {
	// Tiers are listed from the highest threshold down; the first one that
	// count reaches determines the weight.
	tiers := [...]struct {
		atLeast int32
		weight  float64
	}{
		{500, 2.5},
		{200, 2.2},
		{100, 2.0},
		{50, 1.8},
		{20, 1.6},
		{10, 1.4},
		{2, 1.2},
	}

	weight := 1.0
	for _, tier := range tiers {
		if count >= tier.atLeast {
			weight = tier.weight
			break
		}
	}

	// Convert the search conversion ratio into a heat factor.
	return math.Log10(math.Sqrt(ratio+10)) * weight
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment