Commit c70de180 by zhaoyanchao

过滤掉特殊字符 '\ufffc', 全角输入的词转为半角

parent 78e27da4
.idea/ .idea/
main/info.log main/info.log
main/test.go
\ No newline at end of file
package main package main
import ( import (
"container/list"
"strings" "strings"
"github.com/mozillazg/go-pinyin" "github.com/mozillazg/go-pinyin"
"fmt" "fmt"
...@@ -41,36 +40,6 @@ var RUN_ENV = prod_env ...@@ -41,36 +40,6 @@ var RUN_ENV = prod_env
/************************* 下面是 util 方法 *****************************/ /************************* 下面是 util 方法 *****************************/
// factorial returns n! (the product 1 * 2 * ... * n).
//
// Any n <= 1 yields 1. That covers 0! == 1 and also guarantees termination
// for zero and negative inputs, which previously recursed without ever
// reaching the n == 1 base case.
//
// The result is not checked for overflow: 20! is the largest factorial that
// fits in an int64; larger inputs wrap around.
func factorial(n int64) int64 {
	result := int64(1)
	for k := int64(2); k <= n; k++ {
		result *= k
	}
	return result
}
// permutation appends to lst every ordering of the elements arr[begin:],
// with arr[:begin] kept in place as a fixed prefix. Each ordering is pushed
// as a single string: all of arr joined with one space.
//
// arr is rearranged in place while the recursion runs, and is back in its
// original order when the call returns. If begin is greater than len(arr)
// nothing is appended.
func permutation(arr []string, begin int, lst *list.List) {
	// Every slot up to the end has been fixed: record this ordering.
	if begin == len(arr) {
		lst.PushBack(strings.Join(arr, " "))
		return
	}

	next := begin + 1
	for pos := begin; pos < len(arr); pos++ {
		// Put each remaining candidate into slot `begin` in turn ...
		arr[begin], arr[pos] = arr[pos], arr[begin]
		permutation(arr, next, lst)
		// ... then undo the swap so the next iteration sees the original order.
		arr[begin], arr[pos] = arr[pos], arr[begin]
	}
}
//func main() {
// var h = []string{"a","b","c","d"}
// lst := list.New()
// permutation(h,1,lst)
// for p := lst.Front(); p != nil; p = p.Next() {
// fmt.Println(p.Value)
// }
//}
func convertToPinyin(str string) string { func convertToPinyin(str string) string {
var ret string var ret string
...@@ -91,7 +60,7 @@ func convertToPinyin(str string) string { ...@@ -91,7 +60,7 @@ func convertToPinyin(str string) string {
func cleanKeyword(keyword string) string { func cleanKeyword(keyword string) string {
out, err := t2s.Convert(keyword) out, err := t2s.Convert(keyword)
if err != nil { fmt.Println(err) } if err != nil { fmt.Println(err) }
keyword = strings.TrimSpace(strings.ToLower(out)) keyword = strings.ToLower(strings.Trim(DBC2SBC(strings.TrimSpace(out)),"\ufffc|,"))
return strings.Join(strings.Fields(keyword)," ") return strings.Join(strings.Fields(keyword)," ")
} }
...@@ -101,3 +70,22 @@ func strToInt(str string) int32 { ...@@ -101,3 +70,22 @@ func strToInt(str string) int32 {
if err != nil { fmt.Println(err) } if err != nil { fmt.Println(err) }
return int32(v) return int32(v)
} }
// DBC2SBC converts full-width characters in s to their half-width
// equivalents and returns the result.
//
// Two mappings are applied, rune by rune:
//   - U+3000 (ideographic space) becomes an ASCII space.
//   - U+FF01..U+FF5E (full-width '!' through '~') become the matching ASCII
//     characters '!'..'~', which sit exactly 0xFEE0 code points lower.
//
// Every other rune is left untouched. In particular U+FF00, which is not an
// assigned full-width character, is no longer turned into a space.
func DBC2SBC(s string) string {
	const (
		ideographicSpace = 0x3000
		fullWidthFirst   = 0xFF01 // full-width '!'
		fullWidthLast    = 0xFF5E // full-width '~'
		fullWidthOffset  = 0xFEE0 // distance between full-width and ASCII forms
	)
	return strings.Map(func(r rune) rune {
		switch {
		case r == ideographicSpace:
			return ' '
		case r >= fullWidthFirst && r <= fullWidthLast:
			return r - fullWidthOffset
		default:
			return r
		}
	}, s)
}
\ No newline at end of file
...@@ -19,7 +19,6 @@ import ( ...@@ -19,7 +19,6 @@ import (
"time" "time"
"unicode" "unicode"
"unicode/utf8" "unicode/utf8"
"container/list"
) )
type Word struct { type Word struct {
...@@ -109,7 +108,7 @@ func main() { ...@@ -109,7 +108,7 @@ func main() {
if err != nil { log.Print(err.Error()) } if err != nil { log.Print(err.Error()) }
defer datawareDB.Close() defer datawareDB.Close()
fmt.Printf("Cost %d ms\n", time.Since(startTime).Nanoseconds()/1e6) fmt.Printf(" %s task finish Cost %d ms\n", dateStr, time.Since(startTime).Nanoseconds()/1e6)
} }
func checkUnusedData(bulkProcessor *elastic.BulkProcessor) { func checkUnusedData(bulkProcessor *elastic.BulkProcessor) {
...@@ -274,9 +273,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg ...@@ -274,9 +273,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
if v, isExist := wordMap.Load(key); isExist { if v, isExist := wordMap.Load(key); isExist {
merge(w,v) merge(w,v)
} else if v, isExist := existSameWord(key); isExist {
fmt.Println("find same word, now is:" + w.Keyword + " exist is:" + v.(*Word).Keyword)
merge(w,v)
} }
addWord(w,bulkProcessor) addWord(w,bulkProcessor)
...@@ -288,28 +284,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg ...@@ -288,28 +284,6 @@ func queryIndex(idFlag int, db *sql.DB, bulkProcessor *elastic.BulkProcessor, wg
} }
// existSameWord looks in wordMap for an entry that counts as the same record
// as keyword, so the two can be merged.
//
// Keywords are split on whitespace. Two keywords are treated as the same
// record when their first token is identical and the remaining tokens differ
// only in order, e.g. "nike men shoes" and "nike shoes men". Keywords whose
// first token differs, e.g. "nike menshoes" and "menshoes nike", are not
// merged, because that would reduce the suggestions offered when the user
// types "nike".
//
// Lookup rules:
//   - exactly one token: wordMap is queried with keyword itself;
//   - more than five tokens: no lookup is attempted and (nil, false) is
//     returned (presumably to bound the number of permutations tried; with
//     five tokens there are already 4! = 24);
//   - otherwise: every ordering of the tokens after the first is tried, and
//     the first hit in wordMap is returned.
func existSameWord(keyword string) (value interface{}, ok bool) {
	parts := strings.Fields(keyword)

	switch n := len(parts); {
	case n == 1:
		return wordMap.Load(keyword)
	case n > 5:
		return nil, false
	}

	// Keep the first token fixed (begin == 1) and permute the rest.
	candidates := list.New()
	permutation(parts, 1, candidates)

	for e := candidates.Front(); e != nil; e = e.Next() {
		found, hit := wordMap.Load(e.Value)
		if hit {
			return found, true
		}
	}
	return nil, false
}
func merge(word *Word, v interface{}) { func merge(word *Word, v interface{}) {
......
...@@ -3,12 +3,14 @@ package main ...@@ -3,12 +3,14 @@ package main
import ( import (
"sync" "sync"
"fmt" "fmt"
"github.com/liuzl/gocc"
) )
var tmap sync.Map var tmap sync.Map
var t2s1, _ = gocc.New("t2s")
func main() { func main() {
//var t2s, _ = gocc.New("t2s")
//var _, err = t2s.Convert("中國") //var _, err = t2s.Convert("中國")
//if err != nil { fmt.Println("succ")} //if err != nil { fmt.Println("succ")}
...@@ -31,9 +33,9 @@ func main() { ...@@ -31,9 +33,9 @@ func main() {
//} //}
add()
var val,_ = tmap.Load("a") var s = cleanKeyword("zhong ")
fmt.Print(val) fmt.Print(s)
} }
func add() { func add() {
tmap.Store("a","b") tmap.Store("a","b")
...@@ -41,11 +43,32 @@ func add() { ...@@ -41,11 +43,32 @@ func add() {
fmt.Print(val) fmt.Print(val)
} }
//// 求阶乘 //func cleanKeyword(keyword string) string {
//func factorial(n int64) int64 { // out, err := t2s1.Convert(keyword)
// if n == 1 { return 1} // if err != nil { fmt.Println(err) }
// return n * factorial(n-1) // keyword = strings.ToLower(strings.Trim(DBC2SBC(strings.TrimSpace(out)),"\ufffc|,"))
// return strings.Join(strings.Fields(keyword)," ")
//} //}
// //
//// 全角转半角
//func DBC2SBC(s string) string {
// var strLst []string
// for _, i := range s {
// insideCode := i
// if insideCode == 12288 {
// insideCode = 32
// } else {
// insideCode -= 65248
// }
// if insideCode < 32 || insideCode > 126 {
// strLst = append(strLst, string(i))
// } else {
// strLst = append(strLst, string(insideCode))
// }
// }
// return strings.Join(strLst, "")
//}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment