Commit 3b53306e by zhaoyanchao

1. 删除多余的数据结构 usedMap, 使用wordMap 判断是否有使用

2. 添加suggest-task-dependency.go 文件,便于快速切换测试和生产环境
parent 6249c39b
package main
// ENV bundles every externally supplied setting for one runtime environment,
// so main can switch between environments by pointing RUN_ENV at a different
// preset.
type ENV struct {
	// MySQL data source names handed to sql.Open("mysql", ...): DataWareDB is
	// opened by main, ErpDB by loadErpDB.
	DataWareDB, ErpDB string

	// Elasticsearch endpoint plus optional basic-auth credentials. main only
	// configures basic auth when EsUser is non-empty.
	EsInfo, EsUser, EsPassword string

	// Folders passed to loadManual and loadSensitive respectively.
	ManualFolder, SensitiveFolder string
}
// Environment presets and the switch that selects one of them.
//
// NOTE(review): database and Elasticsearch credentials are committed here in
// plain text — consider loading them from the process environment or a secret
// store instead.
var (
	// test_env points at the test MySQL servers, an unauthenticated local
	// Elasticsearch and Windows-style corpus folders.
	test_env = &ENV{
		DataWareDB:      "DataWarehouse_test:FihdZW7o1XKtDETZexOG@tcp(test01-secooDataWarehouse.master.com:3306)/secooDataWarehouse",
		ErpDB:           "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB",
		EsInfo:          "http://localhost:9200",
		EsUser:          "",
		EsPassword:      "",
		ManualFolder:    "D:\\DataFiles\\suggest_corpus-20180801\\manual",
		SensitiveFolder: "D:\\DataFiles\\suggest_corpus-20180801\\sensitive",
	}

	// prod_env points at the production MySQL servers, the authenticated
	// Elasticsearch cluster and the corpus folders under /data/pssmaster.
	prod_env = &ENV{
		DataWareDB:      "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse",
		ErpDB:           "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB",
		EsInfo:          "http://bigdataescluster.secoolocal.com:9200",
		EsUser:          "search",
		EsPassword:      "search5z0NvEn1D",
		ManualFolder:    "/data/pssmaster/corpus_set/suggest_corpus/manual",
		SensitiveFolder: "/data/pssmaster/corpus_set/suggest_corpus/sensitive",
	}

	// RUN_ENV is the preset the program actually reads; repoint it at test_env
	// to run against the test setup.
	RUN_ENV = prod_env
)
\ No newline at end of file
......@@ -51,8 +51,6 @@ var categoryMap = make(map[string]int)
// Package-level lookup state and the run's date stamp.
var (
	// manualMap maps a keyword to the value processWord copies into
	// Word.ManualValue.
	manualMap = map[string]int32{}
	// sensitiveMap is consulted by processWord with the keyword as key.
	sensitiveMap = map[string]bool{}
	// usedMap records keywords that processWord matched against the brand,
	// manual or category tables.
	usedMap sync.Map

	// now is captured once at start-up; dateStr is its year-month-day
	// rendering (no zero padding) and is assigned to Word.KeywordVersion.
	now     = time.Now()
	dateStr = fmt.Sprintf("%d-%d-%d", now.Year(), now.Month(), now.Day())
)
......@@ -60,47 +58,43 @@ var t2s, _ = gocc.New("t2s")
// prefixFilterArr lists lower-case strings kept for keyword filtering.
// NOTE(review): presumably matched as keyword prefixes by the filter step —
// the consumer is outside this view, confirm there.
var prefixFilterArr = []string{
	// URL schemes.
	"https://", "http://",
	// Dolce & Gabbana spellings.
	"dg", "d & g", "dolce&gabbana", "dolce & gabbana", "杜嘉班纳",
	// Remaining entries, in their original order.
	"避孕", "情趣", "cucci", "乒乓球", "cuccl", "gucii",
}
// Connection strings and corpus folders used when nothing overrides them.
// They start out pointing at a local/test setup; productEnv reassigns them.
var (
	// MySQL DSNs for the data warehouse and ERP databases.
	testDatawareDBInfo = "root:1234@tcp(localhost:3306)/secooErpDB"
	testErpDBInfo      = "3306_test:iS6CXpYqgZ8Mhjui@tcp(10.4.3.223:3306)/secooErpDB"

	// Elasticsearch endpoint.
	testESInfo = "http://localhost:9200"

	// Corpus folders handed to loadManual and loadSensitive.
	testManualFolder    = "D:\\Code\\suggest_corpus-20180801\\manual"
	testSensitiveFolder = "D:\\Code\\suggest_corpus-20180801\\sensitive"
)
func main() {
startTime := time.Now()
//db, err := sql.Open("mysql", "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse")
db, err := sql.Open("mysql", testDatawareDBInfo)
if err != nil { log.Print(err.Error()) }
datawareDB, err := sql.Open("mysql", RUN_ENV.DataWareDB)
if err != nil {
log.Print(err.Error())
}
client, err := elastic.NewClient(elastic.SetURL(testESInfo))
// http://bigdataescluster.secoolocal.com:9200"
//elastic.SetBasicAuth("search", "search5z0NvEn1D"))
var client *elastic.Client
if RUN_ENV.EsUser != "" {
client,err = elastic.NewClient(elastic.SetURL(RUN_ENV.EsInfo),elastic.SetBasicAuth(RUN_ENV.EsUser, RUN_ENV.EsPassword))
} else {
client,err = elastic.NewClient(elastic.SetURL(RUN_ENV.EsInfo))
}
if err != nil { log.Print(err.Error()) }
bulkProcessor, err := elastic.NewBulkProcessorService(client).
Workers(50).
BulkActions(10000).
FlushInterval(1 * time.Second).
BulkActions(5000).
FlushInterval(500*time.Millisecond).
After(after).
Do(context.Background())
if err != nil { log.Print(err.Error()) }
loadErpDB()
manualFolder := testManualFolder
sensitiveFolder := testSensitiveFolder
loadManual(manualFolder)
loadSensitive(sensitiveFolder)
loadManual(RUN_ENV.ManualFolder)
loadSensitive(RUN_ENV.SensitiveFolder)
var wg sync.WaitGroup
arr := queryInfo(db)
arr := queryInfo(datawareDB)
if arr[0] > 1000000 {
count := arr[1] / 10000
log.Printf("maxId/10000=%d\n", count)
for i := 0; i <= count; i++ {
go queryIndex(i*10000, db, bulkProcessor, &wg, wordMap)
go queryIndex(i*10000, datawareDB, bulkProcessor, &wg, wordMap)
}
}
wg.Wait()
......@@ -110,14 +104,14 @@ func main() {
err = bulkProcessor.Flush()
if err != nil { log.Print(err.Error()) }
defer db.Close()
defer datawareDB.Close()
fmt.Printf("Cost %d ms\n", time.Since(startTime).Nanoseconds()/1e6)
}
func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
var tmpMap = make(map[string]bool)
for brand := range brandMap {
tmpMap[brand] = true;
tmpMap[brand] = true
}
for category := range categoryMap {
tmpMap[category] = true
......@@ -126,7 +120,7 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
tmpMap[manual] = true
}
for word := range tmpMap {
if _, exist := usedMap.Load(word); !exist {
if _, exist := wordMap.Load(word); !exist {
addWord(word, bulkProcessor)
}
}
......@@ -146,17 +140,10 @@ func addWord(keyword string, processor *elastic.BulkProcessor) {
}
}
// productEnv repoints the package-level connection settings at the production
// services: the data warehouse replica, the ERP database, the Elasticsearch
// cluster and the corpus folders under /data/pssmaster.
//
// It takes no arguments and returns nothing; its only effect is to overwrite
// the five test* variables.
func productEnv() {
	testDatawareDBInfo = "Search_DataWar_R:pY1P9zUj9x1M65ot5szo@tcp(secooDataWarehouse.slave.com:3306)/secooDataWarehouse"
	// Previously this re-assigned the test DSN (3306_test@10.4.3.223), leaving
	// the ERP lookup on the test server after a "switch" to production. Use the
	// production ERP DSN, the same one referenced in loadErpDB.
	testErpDBInfo = "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB"
	testESInfo = "http://bigdataescluster.secoolocal.com:9200"
	testManualFolder = "/data/pssmaster/corpus_set/suggest_corpus/manual"
	testSensitiveFolder = "/data/pssmaster/corpus_set/suggest_corpus/sensitive"
}
func loadErpDB() {
//db, err := sql.Open("mysql", "so_Erp_R:5RgzudyyFlApTmve@tcp(192.168.50.40:3306)/secooErpDB")
db, err := sql.Open("mysql", testErpDBInfo)
db, err := sql.Open("mysql", RUN_ENV.ErpDB)
if err != nil { log.Print(err.Error()) }
defer db.Close()
......@@ -336,18 +323,15 @@ func processWord(w *Word) {
if _, isExist := brandMap[w.Keyword]; isExist {
w.IsBrand = true
usedMap.Store(w.Keyword, true)
}
if _, isExist := manualMap[w.Keyword]; isExist {
w.IsManual = true
w.ManualValue = manualMap[w.Keyword]
usedMap.Store(w.Keyword,true)
}
if _, isExist := categoryMap[w.Keyword]; isExist {
w.IsCategory = true
usedMap.Store(w.Keyword, true)
}
if _, isExist := sensitiveMap[w.Keyword]; isExist {
......@@ -358,8 +342,9 @@ func processWord(w *Word) {
}
// cleanKeyword normalizes a raw keyword: it runs it through the t2s converter
// (Traditional to Simplified Chinese), lower-cases it, strips leading and
// trailing whitespace and collapses every inner whitespace run to one space.
//
// If the conversion fails the error is printed once and the original keyword
// is normalized instead, so a converter failure no longer discards the input.
func cleanKeyword(keyword string) string {
	// NOTE(review): echoes every keyword to stdout; looks like a debugging
	// aid — confirm it is still wanted.
	fmt.Println(keyword)

	converted, err := t2s.Convert(keyword)
	if err != nil {
		fmt.Println(err)
		// The returned string is not meaningful alongside an error; fall back
		// to the caller's text.
		converted = keyword
	}

	// strings.Fields already drops leading/trailing whitespace, so no separate
	// TrimSpace pass is needed.
	return strings.Join(strings.Fields(strings.ToLower(converted)), " ")
}
......
package main
import (
"time"
"fmt"
"github.com/liuzl/gocc"
"strings"
)
// T is the record type instantiated by main in this file; main only fills in
// KeywordVersion. Each field carries a json tag naming its serialized key.
// NOTE(review): the field set looks like a copy of the Word struct used by the
// indexer — confirm, and consider sharing one definition.
type T struct {
// Keyword text and its pinyin form.
Keyword string `json:"keyword"`
KeywordPinYin string `json:"keywordPinYin"`
// Integer counters, year and week variants.
YearClickCount int32 `json:"yearClickCount"`
YearCartCount int32 `json:"yearCartCount"`
YearCount int32 `json:"yearCount"`
WeekClickCount int32 `json:"weekClickCount"`
WeekCartCount int32 `json:"weekCartCount"`
WeekCount int32 `json:"weekCount"`
// Floating-point ratios, year and week variants.
YearClickRatio float64 `json:"yearClickRatio"`
YearCartRatio float64 `json:"yearCartRatio"`
WeekClickRatio float64 `json:"weekClickRatio"`
WeekCartRatio float64 `json:"weekCartRatio"`
// Classification flags.
IsBrand bool `json:"isBrand"`
IsCategory bool `json:"isCategory"`
IsManual bool `json:"isManual"`
IsSensitive bool `json:"isSensitive"`
ManualValue int32 `json:"manualValue"`
WordRank float64 `json:"wordRank"`
// KeywordVersion is set by main to a "year-month-day" string.
KeywordVersion string `json:"keywordVersion"`
}
// main is a small scratch driver: it prints today's date in the
// "year-month-day" form used for KeywordVersion, exercises the t2s converter
// once, prints a sample keyword with all whitespace removed, and finally
// prints H.
func main() {
	w := new(T)
	today := time.Now()
	w.KeywordVersion = fmt.Sprintf("%d-%d-%d", today.Year(), today.Month(), today.Day())
	fmt.Println(w.KeywordVersion)

	// The converter is only used when it was created successfully, and "succ"
	// is printed only when the conversion really succeeded (the check used to
	// be inverted and the constructor error was discarded).
	if t2s, err := gocc.New("t2s"); err != nil {
		fmt.Println(err)
	} else if _, err := t2s.Convert("中國"); err != nil {
		fmt.Println(err)
	} else {
		fmt.Println("succ")
	}

	s := "意尔康 男 鞋"
	// An earlier regexp-based variant replaced each whitespace run with a
	// single space; this one drops the whitespace entirely.
	//var re, _ = regexp.Compile("\\s+")
	//var st = re.ReplaceAllLiteralString(s," ")
	fmt.Println(strings.Join(strings.Fields(s), ""))

	// H is declared outside the lines visible here.
	fmt.Println(H)
}
......@@ -21,7 +21,7 @@ import (
"time"
"unicode"
"unicode/utf8"
)
)
type Word struct {
Keyword string `json:"keyword"`
......@@ -49,7 +49,6 @@ var brandMap = make(map[string]int)
// Package-level lookup state and the run's date stamp.
var (
	// categoryMap and manualMap are probed by processWord with the keyword as
	// key; manualMap's value is copied into Word.ManualValue.
	categoryMap = map[string]int{}
	manualMap   = map[string]int32{}
	// sensitiveMap is consulted by processWord with the keyword as key.
	sensitiveMap = map[string]bool{}
	// usedMap records keywords that processWord matched against the brand,
	// manual or category tables.
	usedMap sync.Map

	// now is captured once at start-up; dateStr is its year-month-day
	// rendering (no zero padding) and is assigned to Word.KeywordVersion.
	now     = time.Now()
	dateStr = fmt.Sprintf("%d-%d-%d", now.Year(), now.Month(), now.Day())
)
......@@ -107,7 +106,7 @@ func main() {
func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
var tmpMap = make(map[string]bool)
for brand := range brandMap {
tmpMap[brand] = true;
tmpMap[brand] = true
}
for category := range categoryMap {
tmpMap[category] = true
......@@ -116,18 +115,19 @@ func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
tmpMap[manual] = true
}
for word := range tmpMap {
if _, exist := usedMap.Load(word); !exist {
if _, exist := wordMap.Load(word); !exist {
addWord(word, bulkProcessor)
}
}
}
func addWord(keyword string, processor *elastic.BulkProcessor, wordType string) {
func addWord(keyword string, processor *elastic.BulkProcessor) {
var w = new(Word)
w.Keyword = keyword
w.KeywordVersion = dateStr
processWord(w)
if !isFilterWord(w) {
wordMap.Store(keyword,1)
id := fmt.Sprintf("%x", md5.Sum([]byte(w.Keyword)))
req := elastic.NewBulkIndexRequest().
Index("search_suggest_index").
......@@ -313,18 +313,15 @@ func processWord(w *Word) {
if _, isExist := brandMap[w.Keyword]; isExist {
w.IsBrand = true
usedMap.Store(w.Keyword, true)
}
if _, isExist := manualMap[w.Keyword]; isExist {
w.IsManual = true
w.ManualValue = manualMap[w.Keyword]
usedMap.Store(w.Keyword,true)
}
if _, isExist := categoryMap[w.Keyword]; isExist {
w.IsCategory = true
usedMap.Store(w.Keyword, true)
}
if _, isExist := sensitiveMap[w.Keyword]; isExist {
......@@ -334,6 +331,9 @@ func processWord(w *Word) {
calculateWordRank(w)
}
// Convert Traditional Chinese to Simplified Chinese.
// Trim leading and trailing whitespace (Chinese keywords included).
// Collapse redundant blank characters between English words.
func cleanKeyword(keyword string) string {
out, err := t2s.Convert(keyword)
if err != nil { fmt.Println(err) }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment