Commit 3b53306e by zhaoyanchao

1. 删除多余的数据结构 usedMap, 使用wordMap 判断是否有使用

2. 添加suggest-task-dependency.go 文件,便于快速切换测试和生产环境
parent 6249c39b
package main
// ENV bundles the connection settings and corpus folder locations for one
// runtime environment, so that switching environments only requires pointing
// RUN_ENV at a different value.
type ENV struct {
	// DataWareDB and ErpDB are MySQL data source names handed to sql.Open.
	DataWareDB, ErpDB string

	// EsInfo is the Elasticsearch URL. EsUser and EsPassword are the basic-auth
	// credentials; an empty EsUser means the client is created without auth.
	EsInfo, EsUser, EsPassword string

	// ManualFolder and SensitiveFolder are the directories passed to
	// loadManual and loadSensitive respectively.
	ManualFolder, SensitiveFolder string
}
// test_env is the configuration for the test setup: test database accounts,
// an Elasticsearch node on localhost without basic auth, and corpus folders
// on a Windows drive.
//
// NOTE(review): the database DSNs embed credentials directly in source;
// consider loading them from the process environment or a secret store.
var test_env = &ENV{
	DataWareDB:      "DataWarehouse_test:FihdZW7o1XKtDETZexOG@tcp(test01-secooDataWarehouse.master.com:3306)/secooDataWarehouse",
	ErpDB:           "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB",
	EsInfo:          "http://localhost:9200",
	EsUser:          "",
	EsPassword:      "",
	ManualFolder:    "D:\\DataFiles\\suggest_corpus-20180801\\manual",
	SensitiveFolder: "D:\\DataFiles\\suggest_corpus-20180801\\sensitive",
}
// prod_env is the configuration for the production setup: the data warehouse
// and ERP database accounts, the Elasticsearch cluster with basic-auth
// credentials, and corpus folders under /data/pssmaster.
//
// NOTE(review): database and Elasticsearch credentials are embedded directly
// in source; consider loading them from the process environment or a secret
// store.
var prod_env = &ENV{
	DataWareDB:      "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse",
	ErpDB:           "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB",
	EsInfo:          "http://bigdataescluster.secoolocal.com:9200",
	EsUser:          "search",
	EsPassword:      "search5z0NvEn1D",
	ManualFolder:    "/data/pssmaster/corpus_set/suggest_corpus/manual",
	SensitiveFolder: "/data/pssmaster/corpus_set/suggest_corpus/sensitive",
}
var RUN_ENV = prod_env
\ No newline at end of file
...@@ -51,8 +51,6 @@ var categoryMap = make(map[string]int) ...@@ -51,8 +51,6 @@ var categoryMap = make(map[string]int)
var manualMap = make(map[string]int32) var manualMap = make(map[string]int32)
var sensitiveMap = make(map[string]bool) var sensitiveMap = make(map[string]bool)
var usedMap sync.Map
var now = time.Now() var now = time.Now()
var dateStr = fmt.Sprintf("%d-%d-%d",now.Year(),now.Month(),now.Day()) var dateStr = fmt.Sprintf("%d-%d-%d",now.Year(),now.Month(),now.Day())
...@@ -60,47 +58,43 @@ var t2s, _ = gocc.New("t2s") ...@@ -60,47 +58,43 @@ var t2s, _ = gocc.New("t2s")
var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana", var prefixFilterArr = []string{"https://", "http://", "dg", "d & g", "dolce&gabbana",
"dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii"} "dolce & gabbana", "杜嘉班纳", "避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii"}
var testDatawareDBInfo = "root:1234@tcp(localhost:3306)/secooErpDB"
var testErpDBInfo = "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB"
var testESInfo = "http://localhost:9200"
var testManualFolder = "D:\\Code\\suggest_corpus-20180801\\manual"
var testSensitiveFolder = "D:\\Code\\suggest_corpus-20180801\\sensitive"
func main() { func main() {
startTime := time.Now() startTime := time.Now()
//db, err := sql.Open("mysql", "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse")
db, err := sql.Open("mysql", testDatawareDBInfo) datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB)
if err != nil { log.Print(err.Error()) } if err != nil {
log.Print(err.Error())
}
client, err := elastic.NewClient(elastic.SetURL(testESInfo)) var client *elastic.Client
// http://bigdataescluster.secoolocal.com:9200" if RUN_ENV.EsUser != "" {
//elastic.SetBasicAuth("search", "search5z0NvEn1D")) client,err = elastic.NewClient(elastic.SetURL(RUN_ENV.EsInfo),elastic.SetBasicAuth(RUN_ENV.EsUser, RUN_ENV.EsPassword))
} else {
client,err = elastic.NewClient(elastic.SetURL(RUN_ENV.EsInfo))
}
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
bulkProcessor, err := elastic.NewBulkProcessorService(client). bulkProcessor, err := elastic.NewBulkProcessorService(client).
Workers(50). Workers(50).
BulkActions(10000). BulkActions(5000).
FlushInterval(1 * time.Second). FlushInterval(500*time.Millisecond).
After(after). After(after).
Do(context.Background()) Do(context.Background())
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
loadErpDB() loadErpDB()
manualFolder := testManualFolder loadManual(RUN_ENV.ManualFolder)
sensitiveFolder := testSensitiveFolder loadSensitive(RUN_ENV.SensitiveFolder)
loadManual(manualFolder)
loadSensitive(sensitiveFolder)
var wg sync.WaitGroup var wg sync.WaitGroup
arr := queryInfo(db) arr := queryInfo(datawareDB)
if arr[0] > 1000000 { if arr[0] > 1000000 {
count := arr[1] / 10000 count := arr[1] / 10000
log.Printf("maxId/10000=%d\n", count) log.Printf("maxId/10000=%d\n", count)
for i := 0; i <= count; i++ { for i := 0; i <= count; i++ {
go queryIndex(i*10000, db, bulkProcessor, &wg, wordMap) go queryIndex(i*10000, datawareDB, bulkProcessor, &wg, wordMap)
} }
} }
wg.Wait() wg.Wait()
...@@ -110,14 +104,14 @@ func main() { ...@@ -110,14 +104,14 @@ func main() {
err = bulkProcessor.Flush() err = bulkProcessor.Flush()
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
defer db.Close() defer datawareDB.Close()
fmt.Printf("Cost %d ms\n", time.Since(startTime).Nanoseconds()/1e6) fmt.Printf("Cost %d ms\n", time.Since(startTime).Nanoseconds()/1e6)
} }
func checkUnusedData(bulkProcessor *elastic.BulkProcessor) { func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
var tmpMap = make(map[string]bool) var tmpMap = make(map[string]bool)
for brand := range brandMap { for brand := range brandMap {
tmpMap[brand] = true; tmpMap[brand] = true
} }
for category := range categoryMap { for category := range categoryMap {
tmpMap[category] = true tmpMap[category] = true
...@@ -126,7 +120,7 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) { ...@@ -126,7 +120,7 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
tmpMap[manual] = true tmpMap[manual] = true
} }
for word := range tmpMap { for word := range tmpMap {
if _, exist := usedMap.Load(word); !exist { if _, exist := wordMap.Load(word); !exist {
addWord(word, bulkProcessor) addWord(word, bulkProcessor)
} }
} }
...@@ -146,17 +140,10 @@ func addWord(keyword string, processor *elastic.BulkProcessor) { ...@@ -146,17 +140,10 @@ func addWord(keyword string, processor *elastic.BulkProcessor) {
} }
} }
func productEnv() {
testDatawareDBInfo = "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse"
testErpDBInfo = "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB"
testESInfo = "http://bigdataescluster.secoolocal.com:9200"
testManualFolder = "/data/pssmaster/corpus_set/suggest_corpus/manual"
testSensitiveFolder = "/data/pssmaster/corpus_set/suggest_corpus/sensitive"
}
func loadErpDB() { func loadErpDB() {
//db, err := sql.Open("mysql", "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB") //db, err := sql.Open("mysql", "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB")
db, err := sql.Open("mysql", testErpDBInfo) db, err := sql.Open("mysql", RUN_ENV.ErpDB)
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
defer db.Close() defer db.Close()
...@@ -336,18 +323,15 @@ func processWord(w *Word) { ...@@ -336,18 +323,15 @@ func processWord(w *Word) {
if _, isExist := brandMap[w.Keyword]; isExist { if _, isExist := brandMap[w.Keyword]; isExist {
w.IsBrand = true w.IsBrand = true
usedMap.Store(w.Keyword, true)
} }
if _, isExist := manualMap[w.Keyword]; isExist { if _, isExist := manualMap[w.Keyword]; isExist {
w.IsManual = true w.IsManual = true
w.ManualValue = manualMap[w.Keyword] w.ManualValue = manualMap[w.Keyword]
usedMap.Store(w.Keyword,true)
} }
if _, isExist := categoryMap[w.Keyword]; isExist { if _, isExist := categoryMap[w.Keyword]; isExist {
w.IsCategory = true w.IsCategory = true
usedMap.Store(w.Keyword, true)
} }
if _, isExist := sensitiveMap[w.Keyword]; isExist { if _, isExist := sensitiveMap[w.Keyword]; isExist {
...@@ -358,8 +342,9 @@ func processWord(w *Word) { ...@@ -358,8 +342,9 @@ func processWord(w *Word) {
} }
func cleanKeyword(keyword string) string { func cleanKeyword(keyword string) string {
fmt.Println(keyword)
out, err := t2s.Convert(keyword) out, err := t2s.Convert(keyword)
if err != nil { fmt.Println(err) } if err != nil { fmt.Println(err) }
keyword = strings.TrimSpace(strings.ToLower(out)) keyword = strings.TrimSpace(strings.ToLower(out))
return strings.Join(strings.Fields(keyword)," ") return strings.Join(strings.Fields(keyword)," ")
} }
......
package main package main
import ( import (
"time"
"fmt" "fmt"
"github.com/liuzl/gocc"
"strings"
) )
type T struct {
Keyword string `json:"keyword"`
KeywordPinYin string `json:"keywordPinYin"`
YearClickCount int32 `json:"yearClickCount"`
YearCartCount int32 `json:"yearCartCount"`
YearCount int32 `json:"yearCount"`
WeekClickCount int32 `json:"weekClickCount"`
WeekCartCount int32 `json:"weekCartCount"`
WeekCount int32 `json:"weekCount"`
YearClickRatio float64 `json:"yearClickRatio"`
YearCartRatio float64 `json:"yearCartRatio"`
WeekClickRatio float64 `json:"weekClickRatio"`
WeekCartRatio float64 `json:"weekCartRatio"`
IsBrand bool `json:"isBrand"`
IsCategory bool `json:"isCategory"`
IsManual bool `json:"isManual"`
IsSensitive bool `json:"isSensitive"`
ManualValue int32 `json:"manualValue"`
WordRank float64 `json:"wordRank"`
KeywordVersion string `json:"keywordVersion"`
}
func main() { func main() {
var w = new(T) var t2s, _ = gocc.New("t2s")
var t = time.Now() var _, err = t2s.Convert("中國")
w.KeywordVersion = fmt.Sprintf("%d-%d-%d",t.Year(),t.Month(),t.Day()) if err != nil { fmt.Println("succ")}
fmt.Println(w.KeywordVersion)
var s = "意尔康 男 鞋"
//var re, _ = regexp.Compile("\\s+")
//var st = re.ReplaceAllLiteralString(s," ")
fmt.Println(strings.Join(strings.Fields(s),""))
fmt.Println(H)
} }
...@@ -21,7 +21,7 @@ import ( ...@@ -21,7 +21,7 @@ import (
"time" "time"
"unicode" "unicode"
"unicode/utf8" "unicode/utf8"
) )
type Word struct { type Word struct {
Keyword string `json:"keyword"` Keyword string `json:"keyword"`
...@@ -49,7 +49,6 @@ var brandMap = make(map[string]int) ...@@ -49,7 +49,6 @@ var brandMap = make(map[string]int)
var categoryMap = make(map[string]int) var categoryMap = make(map[string]int)
var manualMap = make(map[string]int32) var manualMap = make(map[string]int32)
var sensitiveMap = make(map[string]bool) var sensitiveMap = make(map[string]bool)
var usedMap sync.Map
var now = time.Now() var now = time.Now()
var dateStr = fmt.Sprintf("%d-%d-%d",now.Year(),now.Month(),now.Day()) var dateStr = fmt.Sprintf("%d-%d-%d",now.Year(),now.Month(),now.Day())
...@@ -107,7 +106,7 @@ func main() { ...@@ -107,7 +106,7 @@ func main() {
func checkUnusedData(bulkProcessor *elastic.BulkProcessor) { func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
var tmpMap = make(map[string]bool) var tmpMap = make(map[string]bool)
for brand := range brandMap { for brand := range brandMap {
tmpMap[brand] = true; tmpMap[brand] = true
} }
for category := range categoryMap { for category := range categoryMap {
tmpMap[category] = true tmpMap[category] = true
...@@ -116,18 +115,19 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) { ...@@ -116,18 +115,19 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
tmpMap[manual] = true tmpMap[manual] = true
} }
for word := range tmpMap { for word := range tmpMap {
if _, exist := usedMap.Load(word); !exist { if _, exist := wordMap.Load(word); !exist {
addWord(word, bulkProcessor) addWord(word, bulkProcessor)
} }
} }
} }
func addWord(keyword string, processor *elastic.BulkProcessor, wordType string) { func addWord(keyword string, processor *elastic.BulkProcessor) {
var w = new(Word) var w = new(Word)
w.Keyword = keyword w.Keyword = keyword
w.KeywordVersion = dateStr w.KeywordVersion = dateStr
processWord(w) processWord(w)
if !isFilterWord(w) { if !isFilterWord(w) {
wordMap.Store(keyword,1)
id := fmt.Sprintf("%x", md5.Sum([]byte(w.Keyword))) id := fmt.Sprintf("%x", md5.Sum([]byte(w.Keyword)))
req := elastic.NewBulkIndexRequest(). req := elastic.NewBulkIndexRequest().
Index("search_suggest_index"). Index("search_suggest_index").
...@@ -313,18 +313,15 @@ func processWord(w *Word) { ...@@ -313,18 +313,15 @@ func processWord(w *Word) {
if _, isExist := brandMap[w.Keyword]; isExist { if _, isExist := brandMap[w.Keyword]; isExist {
w.IsBrand = true w.IsBrand = true
usedMap.Store(w.Keyword, true)
} }
if _, isExist := manualMap[w.Keyword]; isExist { if _, isExist := manualMap[w.Keyword]; isExist {
w.IsManual = true w.IsManual = true
w.ManualValue = manualMap[w.Keyword] w.ManualValue = manualMap[w.Keyword]
usedMap.Store(w.Keyword,true)
} }
if _, isExist := categoryMap[w.Keyword]; isExist { if _, isExist := categoryMap[w.Keyword]; isExist {
w.IsCategory = true w.IsCategory = true
usedMap.Store(w.Keyword, true)
} }
if _, isExist := sensitiveMap[w.Keyword]; isExist { if _, isExist := sensitiveMap[w.Keyword]; isExist {
...@@ -334,6 +331,9 @@ func processWord(w *Word) { ...@@ -334,6 +331,9 @@ func processWord(w *Word) {
calculateWordRank(w) calculateWordRank(w)
} }
// traditional chinese to simple chinese
// chinese trim
// english remove redundant blank char
func cleanKeyword(keyword string) string { func cleanKeyword(keyword string) string {
out, err := t2s.Convert(keyword) out, err := t2s.Convert(keyword)
if err != nil { fmt.Println(err) } if err != nil { fmt.Println(err) }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment