Commit d2e9cd84 by xupeng

init project

parent 04ea135a
package com.secoo.so.suggest.config;
import com.alibaba.fastjson.JSON;
import com.secoo.so.suggest.util.StringUtils;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
......@@ -80,17 +76,16 @@ public class ConfigUtil {
return defaultValue;
}
private static List<String> prefixFilterList = null;
public static List<String> getPrefixFilterList() {
if (prefixFilterList == null) {
String val = getString("prefix_filter_list");
if (StringUtils.isNotBlank(val)) {
prefixFilterList = JSON.parseArray(val, String.class);
public static long getLong(String key, long defaultValue) {
String val = getString(key);
if (val != null) {
try {
return Long.parseLong(val);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
prefixFilterList = new ArrayList<>();
}
return prefixFilterList;
return defaultValue;
}
}
package com.secoo.so.suggest.db;
import com.secoo.so.suggest.entity.SearchKeywordInfo;
import com.secoo.so.suggest.util.ObjectUtils;
import com.secoo.so.suggest.util.StringUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.dbcp.BasicDataSource;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Properties;
import java.util.*;
@Slf4j
public class DwDataSource {
......@@ -22,7 +27,7 @@ public class DwDataSource {
try {
prop.load(DwDataSource.class.getClassLoader().getResourceAsStream("db.properties"));
} catch (IOException e) {
log.error("init config error", e);
log.error("init db config error", e);
}
dataSource.setDriverClassName("com.mysql.jdbc.Driver");
dataSource.setUrl(prop.getProperty("dw.read.url"));
......@@ -53,4 +58,110 @@ public class DwDataSource {
log.error("close error", e);
}
}
/**
 * Reads the row count and the smallest/largest id of
 * {@code app_search_keyword_year_week_p_day} with one aggregate query.
 *
 * @return a map holding the keys {@code count}, {@code maxId} and {@code minId};
 *         the map stays empty when the query fails (the error is logged, not rethrown)
 *         or yields no row
 */
public static Map<String, Long> querySearchWordCountAndMaxId() {
    final String sql = "select count(*) as cnt, max(id) as max_id, min(id) as min_id from app_search_keyword_year_week_p_day";
    Map<String, Long> stats = new HashMap<>();
    Connection connection = DwDataSource.getConnection();
    PreparedStatement statement = null;
    ResultSet rows = null;
    try {
        statement = connection.prepareStatement(sql);
        rows = statement.executeQuery();
        while (rows.next()) {
            // Read all three columns before touching the map so a failing read leaves it untouched.
            long total = rows.getLong("cnt");
            long largestId = rows.getLong("max_id");
            long smallestId = rows.getLong("min_id");
            stats.put("count", total);
            stats.put("maxId", largestId);
            stats.put("minId", smallestId);
        }
    } catch (Exception e) {
        log.error("querySearchWordCountAndMaxId error", e);
    } finally {
        ObjectUtils.safeClose(connection, statement, rows);
    }
    return stats;
}
/**
 * Loads the search keyword rows of {@code app_search_keyword_year_week_p_day}
 * whose id lies in the half-open range {@code [startId, endId)}.
 * Rows with a blank keyword are skipped.
 *
 * @param startId first id to include
 * @param endId   first id to exclude
 * @return the mapped rows; on a query error the error is logged and whatever
 *         was collected so far (possibly nothing) is returned
 */
public static List<SearchKeywordInfo> querySearchKeywordInfoList(long startId, long endId) {
    final String sql = "select id, keyword, year_pv, year_product_click_count, year_add_cart_count, "
            + " week_pv, week_product_click_count, week_add_cart_count, p_day, "
            + " week_uv, week_product_click_uv, week_add_cart_uv, "
            + " month_pv, month_product_click_count, month_add_cart_count, month_uv,"
            + " month_product_click_uv, month_add_cart_uv, prepare_tags "
            + " from app_search_keyword_year_week_p_day where id >= ? and id < ?";
    List<SearchKeywordInfo> keywords = new ArrayList<>();
    Connection connection = DwDataSource.getConnection();
    PreparedStatement statement = null;
    ResultSet rows = null;
    try {
        statement = connection.prepareStatement(sql);
        statement.setLong(1, startId);
        statement.setLong(2, endId);
        rows = statement.executeQuery();
        while (rows.next()) {
            long id = rows.getLong("id");
            String keyword = rows.getString("keyword");
            if (StringUtils.isBlank(keyword)) {
                continue;
            }
            SearchKeywordInfo info = new SearchKeywordInfo();
            info.setId(id);
            info.setKeyword(keyword);
            info.setPrepareTags(rows.getString("prepare_tags"));
            // Yearly counters are mapped as Integer, weekly/monthly ones as Long.
            info.setYearPv(rows.getInt("year_pv"));
            info.setYearProductClickCount(rows.getInt("year_product_click_count"));
            info.setYearAddCartCount(rows.getInt("year_add_cart_count"));
            info.setWeekPv(rows.getLong("week_pv"));
            info.setWeekProductClickCount(rows.getLong("week_product_click_count"));
            info.setWeekAddCartCount(rows.getLong("week_add_cart_count"));
            info.setWeekUv(rows.getLong("week_uv"));
            info.setWeekProductClickUv(rows.getLong("week_product_click_uv"));
            info.setWeekAddCartUv(rows.getLong("week_add_cart_uv"));
            info.setMonthPv(rows.getLong("month_pv"));
            info.setMonthProductClickCount(rows.getLong("month_product_click_count"));
            info.setMonthAddCartCount(rows.getLong("month_add_cart_count"));
            info.setMonthUv(rows.getLong("month_uv"));
            info.setMonthProductClickUv(rows.getLong("month_product_click_uv"));
            info.setMonthAddCartUv(rows.getLong("month_add_cart_uv"));
            info.setPDay(rows.getString("p_day"));
            keywords.add(info);
        }
    } catch (Exception e) {
        log.error("querySearchKeywordInfoList error", e);
    } finally {
        ObjectUtils.safeClose(connection, statement, rows);
    }
    return keywords;
}
}
package com.secoo.so.suggest.db;
import com.secoo.so.suggest.entity.BrandInfo;
import com.secoo.so.suggest.entity.CategoryInfo;
import com.secoo.so.suggest.util.ObjectUtils;
import com.secoo.so.suggest.util.StringUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.dbcp.BasicDataSource;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
@Slf4j
......@@ -53,4 +61,76 @@ public class ErpDataSource {
log.error("close error", e);
}
}
/**
 * Loads the brands of {@code secooErpDB.t_product_brand} that are neither
 * deleted ({@code is_del = 0}) nor disabled ({@code enabled = 1}).
 * Rows whose id is not positive are skipped.
 *
 * @return the mapped brands; on a query error the error is logged and whatever
 *         was collected so far (possibly nothing) is returned
 */
public static List<BrandInfo> queryBrandInfoList() {
    final String sql = "select id,en_name,ch_name,short_name,nickname from secooErpDB.t_product_brand where is_del = 0 and enabled = 1";
    List<BrandInfo> brands = new ArrayList<>();
    Connection connection = ErpDataSource.getConnection();
    PreparedStatement statement = null;
    ResultSet rows = null;
    try {
        statement = connection.prepareStatement(sql);
        rows = statement.executeQuery();
        while (rows.next()) {
            // getLong returns a primitive, so only the sign needs checking.
            long brandId = rows.getLong("id");
            if (brandId <= 0) {
                continue;
            }
            BrandInfo brand = new BrandInfo();
            brand.setId(brandId);
            brand.setEnName(rows.getString("en_name"));
            brand.setChName(rows.getString("ch_name"));
            brand.setShortName(rows.getString("short_name"));
            brand.setNickName(rows.getString("nickname"));
            brands.add(brand);
        }
    } catch (Exception e) {
        log.error("queryBrandInfoList error", e);
    } finally {
        ObjectUtils.safeClose(connection, statement, rows);
    }
    return brands;
}
/**
 * Loads the categories of {@code secooErpDB.t_product_category} that are neither
 * deleted ({@code is_del = 0}) nor disabled ({@code enabled = 1}).
 * Rows with a non-positive id or a blank name are skipped.
 *
 * @return the mapped categories; on a query error the error is logged and whatever
 *         was collected so far (possibly nothing) is returned
 */
public static List<CategoryInfo> queryCategoryInfoList() {
    final String sql = "select id,name from secooErpDB.t_product_category where is_del = 0 and enabled = 1";
    List<CategoryInfo> categories = new ArrayList<>();
    Connection connection = ErpDataSource.getConnection();
    PreparedStatement statement = null;
    ResultSet rows = null;
    try {
        statement = connection.prepareStatement(sql);
        rows = statement.executeQuery();
        while (rows.next()) {
            long categoryId = rows.getLong("id");
            String categoryName = rows.getString("name");
            boolean usable = categoryId > 0 && !StringUtils.isBlank(categoryName);
            if (usable) {
                CategoryInfo category = new CategoryInfo();
                category.setId(categoryId);
                category.setName(categoryName);
                categories.add(category);
            }
        }
    } catch (Exception e) {
        log.error("queryCategoryInfoList error", e);
    } finally {
        ObjectUtils.safeClose(connection, statement, rows);
    }
    return categories;
}
}
......@@ -20,7 +20,7 @@ public class SeoDataSource {
static {
Properties prop = new Properties();
try {
prop.load(ErpDataSource.class.getClassLoader().getResourceAsStream("db.properties"));
prop.load(SeoDataSource.class.getClassLoader().getResourceAsStream("db.properties"));
} catch (IOException e) {
log.error("init config error", e);
}
......
package com.secoo.so.suggest.db;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * DAO placeholder for the suggest task. At the moment the class only declares a
 * logger; its single query method is commented out below.
 */
public class SuggestTaskDao {
private static final Logger log = LoggerFactory.getLogger(SuggestTaskDao.class);
/**
 * Query (the implementation below is currently commented out and kept for reference only).
 */
// public static List<SearchRelateHotWordInfo> querySearchRelateInfo(SearchRelateHotWordInfo relateHotWordInfo){
// List<SearchRelateHotWordInfo> searchRelateHotWordInfoList = Lists.newArrayList();
// if (relateHotWordInfo == null){
// return searchRelateHotWordInfoList;
// }
// String brandCategoryId = relateHotWordInfo.getBrandCategoryId();
// if (StringUtils.isBlank(brandCategoryId)){
// return searchRelateHotWordInfoList;
// }
// Connection conn = SEODataSource.getConnection();
// PreparedStatement stmt = null;
// ResultSet rs = null;
// String sql = "SELECT category_code,category_code_name,brand_id,brand_name,brand_category_id FROM secooSeoDB.t_search_relate_hot_word WHERE brand_category_id =? and is_del = 0";
// try {
// stmt = conn.prepareStatement(sql);
// stmt.setString(1, brandCategoryId);
// rs = stmt.executeQuery();
// while (rs.next()){
// SearchRelateHotWordInfo searchRelateHotWordInfo = new SearchRelateHotWordInfo();
// String brandCategoryIdStr = rs.getString("brand_category_id");
// if (StringUtils.isBlank(brandCategoryIdStr)){
// continue;
// }
// searchRelateHotWordInfo.setBrandCategoryId(brandCategoryIdStr);
// searchRelateHotWordInfoList.add(searchRelateHotWordInfo);
// }
// }catch (Exception e){
// log.error("querySearchRelateInfo select exception", e);
// }finally {
// DBConnection.close(conn, stmt, rs);
// }
// return searchRelateHotWordInfoList;
// }
}
package com.secoo.so.suggest.entity;
import lombok.Data;
import java.io.Serializable;
/**
 * Brand information.
 * <p>
 * Plain data holder; accessors are generated by Lombok's {@code @Data}.
 * The fields mirror the columns selected from {@code t_product_brand}
 * (id, en_name, ch_name, short_name, nickname).
 */
@Data
public class BrandInfo implements Serializable {
private static final long serialVersionUID = -6388347520294644169L;
// Brand id (column id).
private Long id;
// Column en_name -- presumably the English brand name; confirm against the table definition.
private String enName;
// Column ch_name -- presumably the Chinese brand name; confirm against the table definition.
private String chName;
// Column short_name.
private String shortName;
// Column nickname.
private String nickName;
}
package com.secoo.so.suggest.entity;
import lombok.Data;
import java.io.Serializable;
/**
 * Product category information.
 * <p>
 * Plain data holder; accessors are generated by Lombok's {@code @Data}.
 * The fields mirror the columns selected from {@code t_product_category} (id, name).
 */
@Data
public class CategoryInfo implements Serializable {
private static final long serialVersionUID = -12528308204568143L;
// Category id (column id).
private Long id;
// Category name (column name).
private String name;
}
package com.secoo.so.suggest.entity;
import lombok.Data;
import java.io.Serializable;
/**
 * Suggest keyword record produced by {@code SuggestTask} for each accepted search keyword.
 * <p>
 * Plain data holder; accessors are generated by Lombok's {@code @Data}.
 * NOTE(review): the ratio fields mix {@code Double} and {@code Float}; presumably this
 * matches the target index mapping -- confirm before unifying the types.
 */
@Data
public class EsSuggestKeywordInfo implements Serializable {
private static final long serialVersionUID = -2891215162084524117L;
// Cleaned keyword and the pinyin string derived from it.
private String keyword;
private String keywordPinYin;
// Yearly counters: copied from the source row's yearPv / yearProductClickCount / yearAddCartCount.
private Integer yearCount;
private Integer yearClickCount;
private Integer yearCartCount;
// Weekly counters: copied from the source row's weekPv / weekProductClickCount / weekAddCartCount.
private Long weekCount;
private Long weekClickCount;
private Long weekCartCount;
// Click and add-to-cart ratios derived from the counters above (some are weighted after calculation).
private Double yearClickRatio;
private Float yearCartRatio;
private Float weekClickRatio;
private Double weekCartRatio;
// Flags set from the brand / category / manual / sensitive lookup maps of SuggestTask.
private Boolean isBrand;
private Boolean isCategory;
private Boolean isManual;
private Boolean isSensitive;
// Value from the manual map when isManual is true, otherwise 0.
private Integer manualValue;
// Ranking scores computed by SuggestTask (wordRank and the alternative wordABRank).
private Double wordRank;
private Float wordABRank;
// Filled from the source row's pDay value.
private String keywordVersion;
// Flag set from the europe-word lookup map of SuggestTask.
private Boolean isEuropeWord;
// Comma-separated tags, truncated to the configured maximum before saving.
private String suggestTags;
// Start timestamp (epoch millis) of the task run that produced this record.
private Long updateTime;
}
package com.secoo.so.suggest.entity;
import lombok.Data;
import java.io.Serializable;
/**
 * Search keyword information: one row of table
 * {@code app_search_keyword_year_week_p_day}.
 * <p>
 * Plain data holder; accessors are generated by Lombok's {@code @Data}.
 * NOTE(review): "pv" / "uv" presumably mean page views / unique visitors -- confirm
 * against the warehouse table documentation.
 */
@Data
public class SearchKeywordInfo implements Serializable {
private static final long serialVersionUID = 5479160854636000122L;
// Row id (column id).
private Long id;
// Raw search keyword (column keyword).
private String keyword;
// Column prepare_tags; used as the initial suggest tags of the keyword.
private String prepareTags;
// Yearly counters (columns year_pv, year_product_click_count, year_add_cart_count).
private Integer yearPv;
private Integer yearProductClickCount;
private Integer yearAddCartCount;
// Weekly counters (columns week_pv, week_product_click_count, week_add_cart_count).
private Long weekPv;
private Long weekProductClickCount;
private Long weekAddCartCount;
// Weekly uv counters (columns week_uv, week_product_click_uv, week_add_cart_uv).
private Long weekUv;
private Long weekProductClickUv;
private Long weekAddCartUv;
// Monthly counters (columns month_pv, month_product_click_count, month_add_cart_count).
private Long monthPv;
private Long monthProductClickCount;
private Long monthAddCartCount;
// Monthly uv counters (columns month_uv, month_product_click_uv, month_add_cart_uv).
private Long monthUv;
private Long monthProductClickUv;
private Long monthAddCartUv;
// Column p_day; used as the keyword version of the generated suggest record.
private String pDay;
}
package com.secoo.so.suggest.task;
import com.alibaba.fastjson.JSON;
import com.secoo.so.suggest.config.ConfigUtil;
import com.secoo.so.suggest.db.DwDataSource;
import com.secoo.so.suggest.db.ErpDataSource;
import com.secoo.so.suggest.entity.BrandInfo;
import com.secoo.so.suggest.entity.CategoryInfo;
import com.secoo.so.suggest.entity.EsSuggestKeywordInfo;
import com.secoo.so.suggest.entity.SearchKeywordInfo;
import com.secoo.so.suggest.util.*;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Initializes the suggest search keywords into the ES index.
 */
@Slf4j
public class SuggestTask {
// Lookup tables and settings; main() assigns them before the keyword batches are processed.
private static Map<String, Long> brandMap = new HashMap<>(); // brands: cleaned brand name -> brand id
private static Map<String, Long> categoryMap = new HashMap<>(); // categories: cleaned category name -> category id
private static Map<String, Integer> manualMap = new HashMap<>(); // manual intervention: cleaned keyword -> manual value
private static Map<String, Boolean> sensitiveMap = new HashMap<>(); // sensitive words
private static Map<String, Boolean> europeWordMap = new HashMap<>(); // europe words
private static List<String> prefixFilterList = new ArrayList<>(); // prefix filter list
// Maximum number of suggest tags kept per keyword.
private static int maxTagSize = 5;
// Start timestamp (epoch millis) of the current run.
private static long startTime = System.currentTimeMillis();
/**
 * Entry point of the suggest task: loads configuration and lookup data, then
 * processes all search keywords and logs the total run time.
 *
 * @param args command line arguments (not used)
 */
public static void main(String[] args) {
    // Single source of truth for the run's start time (the former duplicate local was never read).
    startTime = System.currentTimeMillis();
    log.info(">>>>>>>>>>>> start run SuggestTask");

    // Initialise and print the configuration
    ConfigUtil.init();
    ConfigUtil.printAll();
    prefixFilterList = loadPrefixFilterList();
    maxTagSize = ConfigUtil.getInt("suggestTask.suggestTagMaxSize", 5);

    // Load brand and category information
    brandMap = loadBrandMap();
    categoryMap = loadCategoryMap();

    // Load the word lists kept in files
    manualMap = loadManualMap();
    sensitiveMap = loadSensitiveMap();
    europeWordMap = loadEuropeWordMap();

    // Load the search keywords and process them
    processSuggestTask(startTime);
    log.info("<<<<<<<<<<<< end run SuggestTask, cost: {}ms", (System.currentTimeMillis() - startTime));
}
/**
 * Builds the brand lookup table: every non-blank cleaned name variant of a brand
 * (English, Chinese, short name, nickname) maps to the brand id.
 *
 * @return the lookup table; empty when no brand could be loaded
 */
private static Map<String, Long> loadBrandMap() {
    Map<String, Long> nameToBrandId = new HashMap<>();
    List<BrandInfo> brands = ErpDataSource.queryBrandInfoList();
    if (!CollectionUtils.isNotEmpty(brands)) {
        return nameToBrandId;
    }
    for (BrandInfo brand : brands) {
        Long brandId = brand.getId();
        putIfKeyNotBlank(nameToBrandId, cleanKeyword(brand.getEnName()), brandId);
        putIfKeyNotBlank(nameToBrandId, cleanKeyword(brand.getChName()), brandId);
        putIfKeyNotBlank(nameToBrandId, cleanKeyword(brand.getShortName()), brandId);
        putIfKeyNotBlank(nameToBrandId, cleanKeyword(brand.getNickName()), brandId);
    }
    return nameToBrandId;
}
/**
 * Builds the category lookup table: the non-blank cleaned category name maps to the category id.
 *
 * @return the lookup table; empty when no category could be loaded
 */
private static Map<String, Long> loadCategoryMap() {
    Map<String, Long> nameToCategoryId = new HashMap<>();
    List<CategoryInfo> categories = ErpDataSource.queryCategoryInfoList();
    if (!CollectionUtils.isNotEmpty(categories)) {
        return nameToCategoryId;
    }
    for (CategoryInfo category : categories) {
        String cleanedName = cleanKeyword(category.getName());
        putIfKeyNotBlank(nameToCategoryId, cleanedName, category.getId());
    }
    return nameToCategoryId;
}
/**
 * Loads the manual-intervention words from every file in the folder configured as
 * {@code suggestTask.ManualFolder}.
 * <p>
 * Each non-blank line has the form {@code keyword} or {@code keyword|value}; the
 * value defaults to 1 when it is missing or cannot be converted.
 *
 * @return cleaned keyword -> manual value; empty when the folder is not configured,
 *         does not exist or cannot be listed
 */
private static Map<String, Integer> loadManualMap() {
    Map<String, Integer> manualMap = new HashMap<>();
    String folderPath = ConfigUtil.getString("suggestTask.ManualFolder");
    if (!StringUtils.isNotBlank(folderPath)) {
        return manualMap;
    }
    File folder = new File(folderPath);
    if (!folder.exists() || !folder.isDirectory()) {
        return manualMap;
    }
    // listFiles() returns null on an I/O error or missing permission.
    File[] files = folder.listFiles();
    if (files == null) {
        log.warn("loadManualMap: cannot list folder: {}", folder.getAbsolutePath());
        return manualMap;
    }
    for (File file : files) {
        log.info("load loadManualMap file: " + file.getAbsolutePath());
        List<String> lines = FileUtils.readLines(file);
        if (!CollectionUtils.isNotEmpty(lines)) {
            continue;
        }
        for (String line : lines) {
            if (!StringUtils.isNotBlank(line)) {
                continue;
            }
            String[] lineSplit = line.split("\\|");
            // A line made only of separators (e.g. "|") splits into an empty array.
            if (lineSplit.length == 0 || !StringUtils.isNotBlank(lineSplit[0])) {
                continue;
            }
            // Plain if instead of a mixed Integer/int ternary, which would unbox a null result.
            Integer value = null;
            if (lineSplit.length > 1) {
                value = StringUtils.str2Int(lineSplit[1]);
            }
            manualMap.put(cleanKeyword(lineSplit[0]), value != null ? value : 1);
        }
    }
    return manualMap;
}
/**
 * Loads the sensitive words from every file in the folder configured as
 * {@code suggestTask.SensitiveFolder}; each non-blank line is one word.
 *
 * @return cleaned word -> {@code true}; empty when the folder is not configured,
 *         does not exist or cannot be listed
 */
private static Map<String, Boolean> loadSensitiveMap() {
    Map<String, Boolean> sensitiveMap = new HashMap<>();
    String folderPath = ConfigUtil.getString("suggestTask.SensitiveFolder");
    if (!StringUtils.isNotBlank(folderPath)) {
        return sensitiveMap;
    }
    File folder = new File(folderPath);
    if (!folder.exists() || !folder.isDirectory()) {
        return sensitiveMap;
    }
    // listFiles() returns null on an I/O error or missing permission.
    File[] files = folder.listFiles();
    if (files == null) {
        log.warn("loadSensitiveMap: cannot list folder: {}", folder.getAbsolutePath());
        return sensitiveMap;
    }
    for (File file : files) {
        log.info("load loadSensitiveMap file: " + file.getAbsolutePath());
        List<String> lines = FileUtils.readLines(file);
        if (!CollectionUtils.isNotEmpty(lines)) {
            continue;
        }
        for (String line : lines) {
            if (StringUtils.isNotBlank(line)) {
                sensitiveMap.put(cleanKeyword(line), true);
            }
        }
    }
    return sensitiveMap;
}
/**
 * Loads the europe words from every file in the folder configured as
 * {@code suggestTask.EuropeWordFolder}; each non-blank line is one word.
 *
 * @return cleaned word -> {@code true}; empty when the folder is not configured,
 *         does not exist or cannot be listed
 */
private static Map<String, Boolean> loadEuropeWordMap() {
    Map<String, Boolean> europeWordMap = new HashMap<>();
    String folderPath = ConfigUtil.getString("suggestTask.EuropeWordFolder");
    if (!StringUtils.isNotBlank(folderPath)) {
        return europeWordMap;
    }
    File folder = new File(folderPath);
    if (!folder.exists() || !folder.isDirectory()) {
        return europeWordMap;
    }
    // listFiles() returns null on an I/O error or missing permission.
    File[] files = folder.listFiles();
    if (files == null) {
        log.warn("loadEuropeWordMap: cannot list folder: {}", folder.getAbsolutePath());
        return europeWordMap;
    }
    for (File file : files) {
        log.info("load loadEuropeWordMap file: " + file.getAbsolutePath());
        List<String> lines = FileUtils.readLines(file);
        if (!CollectionUtils.isNotEmpty(lines)) {
            continue;
        }
        for (String line : lines) {
            if (StringUtils.isNotBlank(line)) {
                europeWordMap.put(cleanKeyword(line), true);
            }
        }
    }
    return europeWordMap;
}
/**
 * Reads the keyword prefix filter list from the configuration entry
 * {@code suggestTask.prefixFilterList}, which is expected to hold a JSON array of strings.
 *
 * @return the configured prefixes; an empty list when the entry is blank or parses to nothing
 */
public static List<String> loadPrefixFilterList() {
    String val = ConfigUtil.getString("suggestTask.prefixFilterList");
    if (!StringUtils.isNotBlank(val)) {
        return new ArrayList<>();
    }
    // The stray "end run" log line that referenced an undefined variable was removed from here.
    List<String> parsed = JSON.parseArray(val, String.class);
    // Never hand out null: callers iterate over the result.
    return parsed != null ? parsed : new ArrayList<>();
}
/**
 * Normalises a keyword so it can be used as a lookup key: runs it through
 * {@code PinYinUtils.convertToSimplifiedChinese} and {@code StringUtils.dbc2Sbc},
 * replaces the object-replacement character, commas and dots by a space, passes the
 * result through {@code StringUtils.cleanMultiBlank} and lower-cases it.
 *
 * @param keyword raw keyword, may be {@code null}
 * @return the normalised keyword, or {@code null} when the input is {@code null}
 */
private static String cleanKeyword(String keyword) {
    if (keyword == null) {
        return null;
    }
    String simplified = PinYinUtils.convertToSimplifiedChinese(keyword);
    String normalized = StringUtils.dbc2Sbc(simplified);
    String separatorsAsSpaces = normalized.replaceAll("\ufffc|,|,|\\.", " ");
    return StringUtils.cleanMultiBlank(separatorsAsSpaces).toLowerCase();
}
/**
 * Stores {@code value} under {@code key} unless the map is {@code null} or the key is blank.
 *
 * @param map   target map, may be {@code null}
 * @param key   key to store under
 * @param value value to store
 */
private static void putIfKeyNotBlank(Map<String, Long> map, String key, Long value) {
    if (map == null) {
        return;
    }
    if (!StringUtils.isNotBlank(key)) {
        return;
    }
    map.put(key, value);
}
/**
 * Runs the suggest task: checks the size of the keyword table, then processes the
 * keywords in id batches on a fixed thread pool and waits until all batches are done.
 * <p>
 * When the keyword statistics are unavailable or the count is below the configured
 * threshold, a warning message is sent and nothing is processed.
 *
 * @param startTime start timestamp (epoch millis) of the run, handed to every batch
 */
private static void processSuggestTask(long startTime) {
    // Query the number of search keywords and the id range
    Map<String, Long> countResultMap = DwDataSource.querySearchWordCountAndMaxId();
    Long count = countResultMap.get("count");
    Long maxId = countResultMap.get("maxId");
    Long minId = countResultMap.get("minId");
    log.info("querySearchWordCountAndMaxId: count={}, maxId={}", count, maxId);

    long warningCount = ConfigUtil.getLong("suggestTask.searchWordWarningCount", 1000000);
    // The map is empty when the statistics query failed; treat that like "too few keywords"
    // instead of failing with a NullPointerException while unboxing.
    boolean statsMissing = count == null || maxId == null || minId == null;
    if (statsMissing || count < warningCount) {
        log.warn("search word count is too little: count={}, warningCount={}, send warning", count, warningCount);
        List<String> phones = StringUtils.splitToList(ConfigUtil.getString("suggestTask.warningPhones"), ",");
        FeiShuUtil.sendMessage("suggest-task异常", "搜索词数量过低,不执行索引", phones);
        return;
    }

    // Process the keywords concurrently, one task per id batch
    long batchSize = ConfigUtil.getLong("suggestTask.batchSize", 10000);
    if (batchSize <= 0) {
        // A non-positive step would never advance the loop below.
        log.warn("invalid suggestTask.batchSize={}, fall back to 10000", batchSize);
        batchSize = 10000;
    }
    int threadPoolSize = ConfigUtil.getInt("suggestTask.threadPoolSize", 10);
    ExecutorService execThreadPool = Executors.newFixedThreadPool(threadPoolSize);
    // Each task covers the half-open id range [startId, startId + batchSize).
    for (long startId = minId; startId <= maxId; startId = startId + batchSize) {
        execThreadPool.submit(new SearchKeywordProcessTask(startId, startId + batchSize, startTime));
    }
    execThreadPool.shutdown();

    // Poll until every submitted batch has finished
    while (true) {
        if (execThreadPool.isTerminated()) {
            log.info("所有的子线程都结束了, 关闭线程池成功");
            break;
        }
        ObjectUtils.safeSleep(5000);
    }
}
/**
 * Converts one batch of search keywords into suggest records, scores them, drops the
 * ones rejected by {@code isFilterSuggestKeyword} and saves the rest.
 *
 * @param searchKeywordInfoList batch of source rows; nothing happens when it is empty
 * @param startTime             start timestamp (epoch millis) of the run, stored as update time
 */
private static void processSearchKeyword(List<SearchKeywordInfo> searchKeywordInfoList, long startTime) {
    if (!CollectionUtils.isNotEmpty(searchKeywordInfoList)) {
        return;
    }
    List<EsSuggestKeywordInfo> accepted = new ArrayList<>();
    for (SearchKeywordInfo source : searchKeywordInfoList) {
        if (!StringUtils.isNotBlank(source.getKeyword())) {
            continue;
        }
        String keyword = cleanKeyword(source.getKeyword());

        EsSuggestKeywordInfo suggestion = new EsSuggestKeywordInfo();
        suggestion.setKeyword(keyword);
        suggestion.setKeywordPinYin(PinYinUtils.changeToWithoutTonePinYin(keyword, ""));

        // Counters copied from the source row
        suggestion.setYearCount(source.getYearPv());
        suggestion.setYearClickCount(source.getYearProductClickCount());
        suggestion.setYearCartCount(source.getYearAddCartCount());
        suggestion.setWeekCount(source.getWeekPv());
        suggestion.setWeekClickCount(source.getWeekProductClickCount());
        suggestion.setWeekCartCount(source.getWeekAddCartCount());
        suggestion.setSuggestTags(source.getPrepareTags());
        suggestion.setKeywordVersion(source.getPDay());
        suggestion.setUpdateTime(startTime);

        // Flags looked up by the cleaned keyword
        suggestion.setIsBrand(brandMap.containsKey(keyword));
        suggestion.setIsCategory(categoryMap.containsKey(keyword));
        suggestion.setIsSensitive(sensitiveMap.containsKey(keyword));
        suggestion.setIsEuropeWord(europeWordMap.containsKey(keyword));
        boolean manual = manualMap.containsKey(keyword);
        suggestion.setIsManual(manual);
        suggestion.setManualValue(manual ? manualMap.get(keyword) : 0);

        // Compute ratios and ranking scores
        processEsSuggestKeywordInfo(suggestion, source);

        // Keep only keywords that pass the filter; clean their tags before saving
        if (isFilterSuggestKeyword(suggestion)) {
            continue;
        }
        cleanBeforeSaveToEs(suggestion);
        accepted.add(suggestion);
    }
    // Save to ES (disabled for now)
    // saveSuggestKeywordToEs(suggestKeywordInfoList);
    // For testing: write the result to a file instead
    saveSuggestKeywordToFile(accepted);
}
/**
 * Saves the suggest records to ES. Not implemented yet: the method currently does nothing.
 *
 * @param suggestKeywordInfoList records to save
 */
private static void saveSuggestKeywordToEs(List<EsSuggestKeywordInfo> suggestKeywordInfoList) {
    if (!CollectionUtils.isNotEmpty(suggestKeywordInfoList)) {
        return;
    }
    // Intentionally empty: the ES write has not been implemented.
}
/**
 * Writes the suggest records as JSON, one record per line, to a file under
 * {@code /tmp/suggest-task/} whose name contains the run's start time and the id of
 * the calling thread.
 *
 * @param suggestKeywordInfoList records to write; nothing happens when it is empty
 */
private static void saveSuggestKeywordToFile(List<EsSuggestKeywordInfo> suggestKeywordInfoList) {
    if (!CollectionUtils.isNotEmpty(suggestKeywordInfoList)) {
        return;
    }
    List<String> jsonLines = new ArrayList<>();
    for (EsSuggestKeywordInfo record : suggestKeywordInfoList) {
        jsonLines.add(JSON.toJSONString(record));
    }
    String timestamp = DateUtils.formatDate(startTime, "yyyyMMddHHmmss");
    long threadId = Thread.currentThread().getId();
    String fileName = "/tmp/suggest-task/suggest_index-" + timestamp + "-" + threadId + ".json";
    log.info("save result to file: " + fileName);
    FileUtils.saveToFile(jsonLines, fileName, true);
}
/**
 * Decides whether a suggest keyword has to be dropped.
 * <p>
 * Brand, category and manually configured keywords are always kept (and their
 * sensitive flag is cleared). All other keywords are dropped when they are sensitive,
 * longer than 30 characters, purely numeric, start with a filtered prefix, have fewer
 * than 2 yearly searches or clicks, or fail the ratio check for their popularity class.
 *
 * @param suggestKeywordInfo keyword with counters, flags and ratios already filled in
 * @return {@code true} when the keyword must be filtered out
 */
private static boolean isFilterSuggestKeyword(EsSuggestKeywordInfo suggestKeywordInfo) {
    // Brand, category and manual-intervention words are never filtered
    if (suggestKeywordInfo.getIsBrand() || suggestKeywordInfo.getIsCategory() || suggestKeywordInfo.getIsManual()) {
        suggestKeywordInfo.setIsSensitive(false);
        return false;
    }
    // Sensitive words
    if (suggestKeywordInfo.getIsSensitive()) {
        return true;
    }
    String keyword = suggestKeywordInfo.getKeyword();
    // Overlong words
    if (keyword.length() > 30) {
        return true;
    }
    // Purely numeric words (originally meant to drop product ids, which consist of 7 digits)
    if (StringUtils.isNumber(keyword)) {
        return true;
    }
    // Filtered prefixes
    for (String prefix : prefixFilterList) {
        if (keyword.startsWith(prefix)) {
            return true;
        }
    }
    // Too little yearly data
    if (suggestKeywordInfo.getYearCount() < 2 || suggestKeywordInfo.getYearClickCount() < 2) {
        return true;
    }
    // Hot words (see isHotSearchWord) are kept when isHighCartRatio accepts them;
    // all other words are kept when isHighClickRatio accepts them.
    return isHotSearchWord(suggestKeywordInfo)
            ? !isHighCartRatio(suggestKeywordInfo)
            : !isHighClickRatio(suggestKeywordInfo);
}
/**
 * Cleans the suggest tags before a record is saved: a missing or literal "null" tag
 * string becomes empty, and a comma-separated tag list longer than {@code maxTagSize}
 * is cut down to its first {@code maxTagSize} entries.
 *
 * @param suggestKeywordInfo record to clean, may be {@code null}
 */
private static void cleanBeforeSaveToEs(EsSuggestKeywordInfo suggestKeywordInfo) {
    if (suggestKeywordInfo == null) {
        return;
    }
    String rawTags = suggestKeywordInfo.getSuggestTags();
    if (rawTags == null || "null".equalsIgnoreCase(rawTags)) {
        suggestKeywordInfo.setSuggestTags("");
        return;
    }
    List<String> tags = StringUtils.splitToList(rawTags, ",");
    if (tags.size() <= maxTagSize) {
        return;
    }
    List<String> kept = CollectionUtils.subList(tags, 0, maxTagSize);
    suggestKeywordInfo.setSuggestTags(StringUtils.join(kept, ","));
}
/**
 * Tells whether a keyword counts as a hot search word: more than 50 searches in the
 * year or more than 5 in the week.
 */
private static boolean isHotSearchWord(EsSuggestKeywordInfo suggestKeywordInfo) {
    if (suggestKeywordInfo.getYearCount() > 50) {
        return true;
    }
    return suggestKeywordInfo.getWeekCount() > 5;
}
/**
 * Tells whether any conversion signal of the keyword is high: a yearly or weekly cart
 * ratio above 0.025, or a yearly or weekly click ratio above 0.1.
 */
private static boolean isHighCartRatio(EsSuggestKeywordInfo suggestKeywordInfo) {
    // Checks stay in the original order so evaluation stops at the first hit.
    if (suggestKeywordInfo.getYearCartRatio() > 0.025) {
        return true;
    }
    if (suggestKeywordInfo.getWeekCartRatio() > 0.025) {
        return true;
    }
    if (suggestKeywordInfo.getYearClickRatio() > 0.1) {
        return true;
    }
    return suggestKeywordInfo.getWeekClickRatio() > 0.1;
}
/**
 * Tells whether a keyword that is not a hot word is still worth keeping.
 * <p>
 * Keywords with fewer than 5 yearly searches, a yearly click ratio below 0.6 and no
 * add-to-cart are rejected outright. Otherwise a yearly or weekly click ratio above
 * 0.2, or at least one yearly add-to-cart, is enough.
 */
private static boolean isHighClickRatio(EsSuggestKeywordInfo suggestKeywordInfo) {
    boolean tooLittleSignal = suggestKeywordInfo.getYearCount() < 5
            && suggestKeywordInfo.getYearClickRatio() < 0.6
            && suggestKeywordInfo.getYearCartCount() == 0;
    if (tooLittleSignal) {
        return false;
    }
    if (suggestKeywordInfo.getYearClickRatio() > 0.2) {
        return true;
    }
    if (suggestKeywordInfo.getWeekClickRatio() > 0.2) {
        return true;
    }
    return suggestKeywordInfo.getYearCartCount() >= 1;
}
/**
 * Fills in the derived attributes of a suggest record: the yearly and weekly click and
 * cart ratios (partly weighted) and, based on them, both ranking scores.
 *
 * @param suggestKeywordInfo record whose counters and flags are already set
 * @param searchKeywordInfo  source row, needed for the alternative (AB) ranking
 */
private static void processEsSuggestKeywordInfo(EsSuggestKeywordInfo suggestKeywordInfo, SearchKeywordInfo searchKeywordInfo) {
    final Integer yearCount = suggestKeywordInfo.getYearCount();
    final Integer yearCartCount = suggestKeywordInfo.getYearCartCount();
    final Long weekCount = suggestKeywordInfo.getWeekCount();
    final Long weekClickCount = suggestKeywordInfo.getWeekClickCount();
    final Long weekCartCount = suggestKeywordInfo.getWeekCartCount();

    // Yearly click and cart ratios
    suggestKeywordInfo.setYearClickRatio(
            CalculateUtils.calculateRatio(suggestKeywordInfo.getYearClickCount(), yearCount));
    suggestKeywordInfo.setYearCartRatio(
            CalculateUtils.calculateRatio(yearCartCount, yearCount).floatValue());

    // Weekly click and cart ratios (the long counters are narrowed to int for the helper)
    suggestKeywordInfo.setWeekClickRatio(
            CalculateUtils.calculateRatio(weekClickCount.intValue(), weekCount.intValue()).floatValue());
    suggestKeywordInfo.setWeekCartRatio(
            CalculateUtils.calculateRatio(weekCartCount.intValue(), weekCount.intValue()));

    // Extra weight: yearly cart ratio x3
    if (yearCount != 0 && yearCartCount != 0) {
        suggestKeywordInfo.setYearCartRatio(suggestKeywordInfo.getYearCartRatio() * 3);
    }
    // Extra weight: weekly cart ratio x3
    if (weekCount != 0 && weekCartCount != 0) {
        suggestKeywordInfo.setWeekCartRatio(suggestKeywordInfo.getWeekCartRatio() * 3);
    }
    // Extra weight: weekly click ratio x2
    if (weekCount != 0 && weekClickCount != 0) {
        suggestKeywordInfo.setWeekClickRatio(suggestKeywordInfo.getWeekClickRatio() * 2);
    }

    calculateWordRank(suggestKeywordInfo);
    calculateWordABRank(suggestKeywordInfo, searchKeywordInfo);
    addNewScoreIfNewHotWord(suggestKeywordInfo);
}
/**
 * Computes the ranking score of a keyword and stores it as {@code wordRank}.
 * <p>
 * The score starts at 10000, adds weighted length, count and ratio factors, and is
 * then multiplied by 1.8 for brands, 1.2 for categories and the square root of the
 * manual value for manually configured keywords.
 *
 * @param suggestKeywordInfo record whose counters, ratios and flags are already set
 */
private static void calculateWordRank(EsSuggestKeywordInfo suggestKeywordInfo) {
    double score = 10000.0;
    // Length factor
    score += 3000 * CalculateUtils.calculateLengthFactor(suggestKeywordInfo.getKeyword().length());
    // Yearly count factor
    score += 2000 * CalculateUtils.calculateCountFactor(suggestKeywordInfo.getYearCount(), 1);
    // Weekly count factor
    score += 2000 * CalculateUtils.calculateCountFactor(suggestKeywordInfo.getWeekCount().intValue(), 52);
    // Yearly click ratio factor
    score += 3000 * CalculateUtils.calculateRatioFactor(
            suggestKeywordInfo.getYearClickRatio(), suggestKeywordInfo.getYearClickCount());
    // Weekly click ratio factor
    score += 3000 * CalculateUtils.calculateRatioFactor(
            suggestKeywordInfo.getWeekClickRatio().doubleValue(), suggestKeywordInfo.getWeekClickCount().intValue());
    // Yearly cart ratio factor
    score += 3000 * CalculateUtils.calculateRatioFactor(
            suggestKeywordInfo.getYearCartRatio().doubleValue(), suggestKeywordInfo.getYearCartCount());
    // Weekly cart ratio factor
    score += 3000 * CalculateUtils.calculateRatioFactor(
            suggestKeywordInfo.getWeekCartRatio(), suggestKeywordInfo.getWeekCartCount().intValue());

    // Multipliers for special keyword classes
    if (suggestKeywordInfo.getIsBrand()) {
        score *= 1.8;
    }
    if (suggestKeywordInfo.getIsCategory()) {
        score *= 1.2;
    }
    if (suggestKeywordInfo.getIsManual() && suggestKeywordInfo.getManualValue() > 0) {
        score *= Math.sqrt(suggestKeywordInfo.getManualValue() * 1.0);
    }
    suggestKeywordInfo.setWordRank(score);
}
/**
 * Computes the alternative (AB) ranking score of a keyword and stores it as
 * {@code wordABRank}. Compared with {@code calculateWordRank} it uses the monthly and
 * weekly uv counters of the source row instead of the weekly search counts.
 * <p>
 * NOTE(review): several ratio factors receive a total uv (weekUv / monthUv) as their
 * "count" argument while the monthly click factor receives the click uv -- confirm
 * whether this asymmetry is intended.
 *
 * @param suggestKeywordInfo record whose counters, ratios and flags are already set
 * @param searchKeywordInfo  source row providing the monthly and weekly uv counters
 */
private static void calculateWordABRank(EsSuggestKeywordInfo suggestKeywordInfo, SearchKeywordInfo searchKeywordInfo) {
    final Long monthUv = searchKeywordInfo.getMonthUv();
    final Long monthClickUv = searchKeywordInfo.getMonthProductClickUv();
    final Long monthCartUv = searchKeywordInfo.getMonthAddCartUv();
    final Long weekUv = searchKeywordInfo.getWeekUv();
    final Long weekClickUv = searchKeywordInfo.getWeekProductClickUv();
    final Long weekCartUv = searchKeywordInfo.getWeekAddCartUv();

    // Monthly click and cart ratios
    double monthClickRatio = CalculateUtils.calculateRatio(monthClickUv.intValue(), monthUv.intValue());
    double monthCartRatio = CalculateUtils.calculateRatio(monthCartUv.intValue(), monthUv.intValue());
    // Weekly click and cart ratios (uv based, unlike the count based ratios of wordRank)
    double weekClickRatio = CalculateUtils.calculateRatio(weekClickUv.intValue(), weekUv.intValue());
    double weekCartRatio = CalculateUtils.calculateRatio(weekCartUv.intValue(), weekUv.intValue());

    // Extra weights: monthly click x1.5, monthly cart x3, weekly click x2, weekly cart x3
    if (monthClickUv != 0 && monthUv != 0) {
        monthClickRatio *= 1.5;
    }
    if (monthCartUv != 0 && monthUv != 0) {
        monthCartRatio *= 3;
    }
    if (weekClickUv != 0 && weekUv != 0) {
        weekClickRatio *= 2;
    }
    if (weekCartUv != 0 && weekUv != 0) {
        weekCartRatio *= 3;
    }

    double score = 10000.0;
    // Length factor
    score += 3000 * CalculateUtils.calculateLengthFactor(suggestKeywordInfo.getKeyword().length());
    // Monthly, weekly and yearly count factors
    score += 2000 * CalculateUtils.calculateCountFactor(monthUv.intValue(), 4);
    score += 2000 * CalculateUtils.calculateCountFactor(weekUv.intValue(), 52);
    score += 2000 * CalculateUtils.calculateCountFactor(suggestKeywordInfo.getYearCount(), 1);
    // Click: monthly and weekly click ratio factors
    score += 3000 * CalculateUtils.calculateRatioFactor(monthClickRatio, monthClickUv.intValue());
    score += 3000 * CalculateUtils.calculateRatioFactor(weekClickRatio, weekUv.intValue());
    // Cart: yearly, monthly and weekly cart ratio factors
    score += 3000 * CalculateUtils.calculateRatioFactor(
            suggestKeywordInfo.getYearCartRatio().doubleValue(), suggestKeywordInfo.getYearCartCount());
    score += 3000 * CalculateUtils.calculateRatioFactor(monthCartRatio, monthUv.intValue());
    score += 3000 * CalculateUtils.calculateRatioFactor(weekCartRatio, weekUv.intValue());

    // Multipliers for special keyword classes
    if (suggestKeywordInfo.getIsBrand()) {
        score *= 1.8;
    }
    if (suggestKeywordInfo.getIsCategory()) {
        score *= 1.2;
    }
    if (suggestKeywordInfo.getIsManual() && suggestKeywordInfo.getManualValue() > 0) {
        score *= Math.sqrt(suggestKeywordInfo.getManualValue() * 1.0);
    }
    suggestKeywordInfo.setWordABRank((float) score);
}
/**
 * Boosts {@code wordABRank} by a factor of sqrt(5) for keywords that look like new hot
 * words: at least 20 weekly searches that make up a large share of the yearly
 * searches, and at least 3 weekly clicks.
 * <p>
 * NOTE(review): the original comment spoke of a weekly share "above 40%", but the
 * integer expression {@code weekCount * 10 / yearCount} has to exceed 5 -- confirm
 * which threshold is intended.
 *
 * @param suggestKeywordInfo record whose counters and {@code wordABRank} are already set
 */
private static void addNewScoreIfNewHotWord(EsSuggestKeywordInfo suggestKeywordInfo) {
    final Long weekCount = suggestKeywordInfo.getWeekCount();
    final Integer yearCount = suggestKeywordInfo.getYearCount();

    // The share is only meaningful with enough data
    boolean enoughData = weekCount != 0 && yearCount != 0 && weekCount >= 20;
    if (!enoughData) {
        return;
    }
    // Weekly searches as tenths of the yearly searches (integer division)
    long weekShareInTenths = weekCount * 10 / yearCount;
    if (weekShareInTenths <= 5) {
        return;
    }
    if (suggestKeywordInfo.getWeekClickCount() < 3 || weekCount < 5) {
        return;
    }
    // The boost works like a manual intervention value
    double boosted = suggestKeywordInfo.getWordABRank() * Math.sqrt(5.0);
    suggestKeywordInfo.setWordABRank((float) boosted);
}
/**
 * Worker that loads the search keywords of one id batch {@code [startId, endId)} and
 * hands them to {@code processSearchKeyword}.
 */
@Data
static class SearchKeywordProcessTask implements Runnable, Serializable {
    private static final long serialVersionUID = -2853856815712590673L;

    private Long startId;
    private Long endId;
    private Long startTime;

    /**
     * @param startId   first id of the batch (inclusive)
     * @param endId     end id of the batch (exclusive)
     * @param startTime start timestamp (epoch millis) of the run
     */
    public SearchKeywordProcessTask(Long startId, Long endId, Long startTime) {
        this.startId = startId;
        this.endId = endId;
        this.startTime = startTime;
    }

    @Override
    public void run() {
        List<SearchKeywordInfo> batch = DwDataSource.querySearchKeywordInfoList(startId, endId);
        if (!CollectionUtils.isNotEmpty(batch)) {
            return;
        }
        processSearchKeyword(batch, startTime);
    }
}
}
package com.secoo.so.suggest.util;
/**
 * Numeric helpers that turn raw counts and ratios into ranking factors.
 *
 * @author xupeng
 * @date: 2022/1/27
 */
public class CalculateUtils {

    /**
     * Divides {@code numerator} by {@code denominator} as doubles.
     *
     * @param numerator   dividend; may be {@code null}
     * @param denominator divisor; may be {@code null}
     * @return the quotient, or {@code 0} when either argument is {@code null} or zero
     */
    public static Double calculateRatio(Integer numerator, Integer denominator) {
        // The second check used to test the numerator twice, so a zero denominator
        // slipped through and produced Infinity.
        if (numerator == null || numerator == 0 || denominator == null || denominator == 0) {
            return 0D;
        }
        return numerator.doubleValue() / denominator.doubleValue();
    }

    /**
     * Converts a text length into a length factor: {@code 1 / (2 * length + 1)}.
     *
     * @param length text length
     * @return the length factor
     */
    public static Double calculateLengthFactor(Integer length) {
        return 1.0 / (2.0 * length + 1);
    }

    /**
     * Converts a conversion ratio into a heat factor:
     * {@code log10(sqrt(ratio + 10)) * rank}, where {@code rank} is a step weight
     * chosen from {@code count}.
     *
     * @param ratio conversion ratio
     * @param count sample count used to pick the step weight
     * @return the weighted heat factor
     */
    public static Double calculateRatioFactor(Double ratio, Integer count) {
        double rank;
        if (count >= 500) {
            rank = 2.5;
        } else if (count >= 200) {
            rank = 2.2;
        } else if (count >= 100) {
            rank = 2.0;
        } else if (count >= 50) {
            rank = 1.8;
        } else if (count >= 20) {
            rank = 1.6;
        } else if (count >= 10) {
            rank = 1.4;
        } else if (count > 1) {
            rank = 1.2;
        } else {
            // count of 1 or less keeps the neutral weight
            rank = 1.0;
        }
        return Math.log10(Math.sqrt(ratio + 10)) * rank;
    }

    /**
     * Converts a search count into a heat factor:
     * {@code log10(sqrt(count * rank + 10))}.
     *
     * @param count number of searches
     * @param rank  multiplier applied to the count
     * @return the heat factor
     */
    public static Double calculateCountFactor(Integer count, Integer rank) {
        // Computed in double so that a large count * rank cannot overflow int.
        double scaled = count.doubleValue() * rank + 10;
        return Math.log10(Math.sqrt(scaled));
    }
}
......@@ -337,8 +337,8 @@ public abstract class CollectionUtils {
public static <T, A> void putValueToMapWithList(Map<A, List<T>> result, A key, T t, boolean checkValueExistsInList) {
if (result.containsKey(key)) {
List<T> list = result.get(key);
if(checkValueExistsInList){
if(list.contains(t)){
if (checkValueExistsInList) {
if (list.contains(t)) {
return;
}
}
......@@ -454,6 +454,17 @@ public abstract class CollectionUtils {
return result;
}
/**
 * Copies the elements of {@code list} from {@code fromIndex} (inclusive) up to
 * {@code toIndex} (exclusive) into a new list. An upper bound beyond the end of the
 * list is clamped to the list size.
 *
 * @param list      source list; may be {@code null}
 * @param fromIndex index of the first element to copy
 * @param toIndex   index after the last element to copy
 * @return a new list with the copied elements, or {@code null} when {@code list} is {@code null}
 */
public static <T> List<T> subList(List<T> list, int fromIndex, int toIndex) {
    if (list == null) {
        return null;
    }
    int end = Math.min(list.size(), toIndex);
    List<T> copy = new ArrayList<>();
    int index = fromIndex;
    while (index < end) {
        copy.add(list.get(index));
        index++;
    }
    return copy;
}
// Entry point with an empty body; running it does nothing.
public static void main(String[] args) {
}
......
package com.secoo.so.suggest.util;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Formatting helpers built on {@link SimpleDateFormat}. A new formatter is created
 * for every call.
 */
public class DateUtils {
    /** Pattern used by {@link #currentDate()}. */
    public static final String DEFAULT_DATE_FORMAT = "yyyy-MM-dd";
    /** Pattern used by {@link #currentDatetime()} and the single-argument {@code formatDate} overloads. */
    public static final String DEFAULT_DATETIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** @return the current time rendered with {@link #DEFAULT_DATE_FORMAT} */
    public static String currentDate() {
        return currentDate(DEFAULT_DATE_FORMAT);
    }

    /** @return the current time rendered with the given pattern */
    public static String currentDate(String format) {
        return formatDate(new Date(), format);
    }

    /** @return the current time rendered with {@link #DEFAULT_DATETIME_FORMAT} */
    public static String currentDatetime() {
        return currentDatetime(DEFAULT_DATETIME_FORMAT);
    }

    /** @return the current time rendered with the given pattern */
    public static String currentDatetime(String format) {
        return formatDate(new Date(), format);
    }

    /** @return the instant {@code ms} (epoch milliseconds) rendered with {@link #DEFAULT_DATETIME_FORMAT} */
    public static String formatDate(long ms) {
        return formatDate(new Date(ms));
    }

    /** @return the instant {@code ms} (epoch milliseconds) rendered with the given pattern */
    public static String formatDate(long ms, String format) {
        return formatDate(new Date(ms), format);
    }

    /** @return {@code date} rendered with {@link #DEFAULT_DATETIME_FORMAT} */
    public static String formatDate(Date date) {
        return formatDate(date, DEFAULT_DATETIME_FORMAT);
    }

    /** @return {@code date} rendered with the given pattern */
    public static String formatDate(Date date, String format) {
        SimpleDateFormat formatter = new SimpleDateFormat(format);
        return formatter.format(date);
    }
}
package com.secoo.so.suggest.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
public class FileUtils {
private static Logger log = LoggerFactory.getLogger(FileUtils.class);
/**
 * Prepares a {@link File} handle for writing: creates missing parent directories and
 * deletes any file that already exists at that path, so the caller overwrites it.
 * The file itself is not created.
 *
 * @param fileName path of the file
 * @return a File instance for {@code fileName}
 */
public static File createFile(String fileName) {
    File file = new File(fileName);
    // getParentFile() is null for a bare name such as "out.txt"; there is nothing to create then.
    File parent = file.getParentFile();
    if (parent != null && !parent.exists()) {
        parent.mkdirs();
    }
    if (file.exists()) {
        // An existing file is removed rather than reported as an error.
        file.delete();
    }
    return file;
}
/**
 * Deletes {@code file}; a directory is emptied recursively before it is removed.
 * Does nothing when the path does not exist.
 *
 * @param file file or directory to delete
 */
public static void delFile(File file) {
    if (!file.exists()) {
        return;
    }
    if (!file.isFile()) {
        // listFiles() returns null when the directory cannot be listed.
        File[] children = file.listFiles();
        if (children != null) {
            for (File child : children) {
                delFile(child);
            }
        }
    }
    file.delete();
}

/**
 * Deletes the file or directory at {@code path}; see {@link #delFile(File)}.
 *
 * @param path path of the file or directory to delete
 */
public static void delFile(String path) {
    delFile(new File(path));
}
/**
 * Copies the content of {@code sourceFile} into {@code targetFile}.
 *
 * @param sourceFile file to read
 * @param targetFile file to write; existing content is replaced
 * @throws IOException if the source cannot be read or the target cannot be written
 */
public static void copyFile(File sourceFile, File targetFile) throws IOException {
    saveFile(new FileInputStream(sourceFile), targetFile);
}

/**
 * Writes everything readable from {@code inputStream} into {@code targetFile} and
 * closes the stream afterwards.
 *
 * @param inputStream source of the bytes; closed by this method
 * @param targetFile  file to write; existing content is replaced
 * @throws IOException if reading or writing fails
 */
public static void saveFile(InputStream inputStream, File targetFile) throws IOException {
    // try-with-resources closes both streams even when opening the target or closing
    // the other stream fails.
    try (BufferedInputStream inBuff = new BufferedInputStream(inputStream);
         BufferedOutputStream outBuff = new BufferedOutputStream(new FileOutputStream(targetFile))) {
        byte[] buffer = new byte[1024 * 5];
        int len;
        while ((len = inBuff.read(buffer)) != -1) {
            outBuff.write(buffer, 0, len);
        }
        outBuff.flush();
    }
}
/**
 * Writes {@code inputStream} to the file named {@code targetFile}, obtaining the
 * target through {@code createFile} first.
 *
 * @param inputStream source of the bytes
 * @param targetFile  path of the file to write
 * @throws IOException if reading or writing fails
 */
public static void saveFile(InputStream inputStream, String targetFile) throws IOException {
    File target = createFile(targetFile);
    saveFile(inputStream, target);
}
/**
 * Appends {@code lines} to the file named {@code fileName}.
 * This overload used to be a private instance method that nothing could call; it is
 * now a static convenience overload like the others.
 */
public static void saveToFile(List<String> lines, String fileName) {
    saveToFile(lines, fileName, true);
}

/**
 * Appends {@code content} to the file named {@code fileName}.
 * This overload used to be a private instance method that nothing could call; it is
 * now a static convenience overload like the others.
 */
public static void saveToFile(String content, String fileName) {
    saveToFile(content, fileName, true);
}

/**
 * Writes {@code lines} to the file named {@code fileName}.
 *
 * @param append {@code true} to append, {@code false} to overwrite
 */
public static void saveToFile(List<String> lines, String fileName, boolean append) {
    saveToFile(lines, new File(fileName), append);
}

/**
 * Writes {@code content} to the file named {@code fileName}.
 *
 * @param append {@code true} to append, {@code false} to overwrite
 */
public static void saveToFile(String content, String fileName, boolean append) {
    saveToFile(content, new File(fileName), append);
}

/**
 * Writes every line followed by {@code "\n"} to {@code file}. Nothing is written when
 * {@code CollectionUtils.isNotEmpty(lines)} is false.
 *
 * @param append {@code true} to append, {@code false} to overwrite
 */
public static void saveToFile(List<String> lines, File file, boolean append) {
    if (CollectionUtils.isNotEmpty(lines)) {
        StringBuilder sBuilder = new StringBuilder();
        for (String line : lines) {
            sBuilder.append(line).append("\n");
        }
        saveToFile(sBuilder.toString(), file, append);
    }
}

/**
 * Writes {@code content} to {@code file}, creating missing parent directories first.
 * Failures are logged and not propagated.
 *
 * @param content text to write
 * @param file    target file
 * @param append  {@code true} to append, {@code false} to overwrite
 */
public static void saveToFile(String content, File file, boolean append) {
    try {
        // getParentFile() is null for a bare file name; nothing to create in that case.
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        // try-with-resources: the old finally block called close() on a null writer
        // (and threw NullPointerException) whenever the file could not be opened.
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(file, append))) {
            bw.write(content);
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
}
/**
 * Resolves {@code fileName} to a {@link File}; see {@link #getFilePath(String, String)}
 * for the lookup rules.
 *
 * @throws FileNotFoundException if the classpath lookup finds nothing
 */
public static File getFile(String fileName, String propertyName) throws Exception {
    return new File(getFilePath(fileName, propertyName));
}

/**
 * Resolves {@code fileName} to a path string.
 * <p>
 * When {@code propertyName} names a non-empty system property, the property value is
 * used as the directory and {@code fileName} is appended to it. Otherwise the resource
 * {@code propertyName + fileName} is looked up through the class loader and the path of
 * its URL is returned.
 *
 * @param fileName     name of the file
 * @param propertyName system property holding the directory; may be {@code null} or empty
 * @throws FileNotFoundException if the classpath lookup finds nothing
 */
public static String getFilePath(String fileName, String propertyName) throws Exception {
    String dir = configuredDir(propertyName);
    if (dir == null) {
        return findOnClasspath(fileName, propertyName).getPath();
    }
    return appendFileName(dir, fileName);
}

/**
 * Same lookup as {@link #getFilePath(String, String)}, except that in the classpath
 * case the trailing {@code fileName} is removed from the returned path.
 * <p>
 * NOTE(review): in the system-property case the result still ends with
 * {@code fileName}, i.e. it is the file path, not its directory. That looks like a
 * copy-paste slip but is kept because callers may rely on it - confirm.
 *
 * @throws FileNotFoundException if the classpath lookup finds nothing
 */
public static String getFileDir(String fileName, String propertyName) throws Exception {
    String dir = configuredDir(propertyName);
    if (dir == null) {
        String path = findOnClasspath(fileName, propertyName).getPath();
        // Strip only the trailing file name, not every occurrence inside the path.
        return path.endsWith(fileName) ? path.substring(0, path.length() - fileName.length()) : path;
    }
    return appendFileName(dir, fileName);
}

/** Returns the value of system property {@code propertyName}, or {@code null} when the name or the value is null/empty. */
private static String configuredDir(String propertyName) {
    if (propertyName == null || "".equals(propertyName)) {
        return null;
    }
    String dir = System.getProperty(propertyName);
    return (dir == null || "".equals(dir)) ? null : dir;
}

/** Looks up {@code propertyName + fileName} on the classpath; a null {@code propertyName} counts as an empty prefix. */
private static URL findOnClasspath(String fileName, String propertyName) throws FileNotFoundException {
    // A null prefix used to be concatenated as the literal text "null".
    String prefix = propertyName == null ? "" : propertyName;
    URL url = FileUtils.class.getClassLoader().getResource(prefix + fileName);
    if (url == null) {
        throw new FileNotFoundException(fileName + " not found!");
    }
    return url;
}

/** Joins {@code dir} and {@code fileName} with exactly one "/" between them. */
private static String appendFileName(String dir, String fileName) {
    return dir.endsWith("/") ? dir.concat(fileName) : dir.concat("/").concat(fileName);
}
/**
 * Reads the whole file and decodes it as text.
 *
 * @param file    file to read
 * @param charset name of the charset used for decoding; {@code null} or blank falls
 *                back to the platform default
 * @return the decoded content, or an empty string when the file could not be read
 * @throws IllegalArgumentException if the file does not exist, is not a regular file,
 *                                  or {@code charset} is not a supported charset name
 */
public static String read(File file, String charset) {
    final byte[] content = read(file);
    if (content == null) {
        return "";
    }
    if (charset == null || charset.trim().isEmpty()) {
        return new String(content);
    }
    // The charset argument used to be ignored and the platform default was always applied.
    return new String(content, java.nio.charset.Charset.forName(charset));
}

/**
 * Reads the whole file into a byte array.
 *
 * @param file file to read
 * @return the file content, or {@code null} when reading failed (the error is logged)
 * @throws IllegalArgumentException if the file does not exist or is not a regular file
 */
public static byte[] read(File file) {
    if (!(file.exists() && file.isFile())) {
        throw new IllegalArgumentException("The remote not exist or not a remote");
    }
    try {
        // readAllBytes reads until end of file; sizing a buffer from available() and
        // issuing a single read() may return only part of the content.
        return java.nio.file.Files.readAllBytes(file.toPath());
    } catch (IOException e) {
        log.error(e.getMessage(), e);
        return null;
    }
}
/**
 * Loads the properties file at {@code filePath}, sets one entry and writes the file back.
 * I/O failures are reported on {@code System.err} and not propagated.
 *
 * @param filePath       path of the properties file; must already exist
 * @param parameterName  key to set
 * @param parameterValue value to store under the key
 */
public static void saveProperties(String filePath, String parameterName, String parameterValue) {
    Properties prop = new Properties();
    try {
        try (InputStream fis = new FileInputStream(filePath)) {
            prop.load(fis);
        }
        // Update in memory before the file is opened for writing (which truncates it),
        // so a rejected key/value cannot leave an emptied file behind.
        prop.setProperty(parameterName, parameterValue);
        try (OutputStream fos = new FileOutputStream(filePath)) {
            prop.store(fos, "Update '" + parameterName + "' value");
        }
    } catch (IOException e) {
        System.err.println("Visit " + filePath + " for updating " + parameterName + " value error");
    }
}
/**
 * Reads the stream to its end and returns the bytes decoded with the platform default
 * charset. The stream is closed before returning.
 *
 * @param inputStream stream to read
 * @return the decoded content
 * @throws RuntimeException if reading or closing the stream fails
 * @author shaoqiang.guo
 */
public static String readFile(InputStream inputStream) {
    BufferedInputStream source = new BufferedInputStream(inputStream);
    ByteArrayOutputStream sink = new ByteArrayOutputStream(1024);
    byte[] chunk = new byte[1024];
    try {
        for (int n = source.read(chunk); n != -1; n = source.read(chunk)) {
            sink.write(chunk, 0, n);
        }
    } catch (IOException e) {
        throw new RuntimeException("read file error.", e);
    } finally {
        try {
            source.close();
        } catch (IOException e) {
            throw new RuntimeException("close stream error.", e);
        }
    }
    return new String(sink.toByteArray());
}
/**
 * Downloads the resource at {@code urlString} and appends its bytes to the local file
 * {@code fileName}.
 *
 * @param urlString URL to read from
 * @param fileName  path of the local file; opened in append mode
 * @throws Exception if the URL is malformed, the connection fails or writing fails
 */
public static void download(String urlString, String fileName) throws Exception {
    URLConnection con = new URL(urlString).openConnection();
    con.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    con.setRequestProperty("sec-ch-ua", "\"Google Chrome\";v=\"93\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"93\"");
    con.setRequestProperty("cache-control", "no-cache");
    con.setRequestProperty("access-control-expose-headers", "x-ak-country-code");
    // try-with-resources: both streams are closed even when the copy fails part-way.
    try (InputStream is = con.getInputStream();
         FileOutputStream os = new FileOutputStream(new File(fileName), true)) {
        byte[] bs = new byte[1024];
        int len;
        while ((len = is.read(bs)) != -1) {
            os.write(bs, 0, len);
        }
    }
}
/**
 * Downloads the resource at {@code urlString} with an HTTP GET and appends its bytes to
 * the local file {@code fileName}. Uses a 15 s connect timeout and a 20 s read timeout.
 *
 * @param urlString HTTP(S) URL to read from
 * @param fileName  path of the local file; opened in append mode
 * @throws Exception if the URL is malformed, the request fails or writing fails
 */
public static void download2(String urlString, String fileName) throws Exception {
    HttpURLConnection httpURLConnection = (HttpURLConnection) new URL(urlString).openConnection();
    httpURLConnection.setRequestMethod("GET");
    httpURLConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36");
    httpURLConnection.setRequestProperty("Accept-Encoding", "gzip");
    httpURLConnection.setRequestProperty("Referer", "no-referrer");
    httpURLConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
    httpURLConnection.setConnectTimeout(15000);
    httpURLConnection.setReadTimeout(20000);
    // try-with-resources: all streams are closed even when the copy fails part-way.
    try (InputStream raw = httpURLConnection.getInputStream();
         // The request advertises gzip, so a gzip-encoded body must be decoded here;
         // HttpURLConnection hands back the compressed bytes as they arrive.
         InputStream is = "gzip".equalsIgnoreCase(httpURLConnection.getContentEncoding())
                 ? new java.util.zip.GZIPInputStream(raw)
                 : raw;
         FileOutputStream os = new FileOutputStream(new File(fileName), true)) {
        byte[] bs = new byte[1024];
        int len;
        while ((len = is.read(bs)) != -1) {
            os.write(bs, 0, len);
        }
    }
}
/**
 * Reads {@code file} line by line and returns the lines for which
 * {@code StringUtils.isNotBlank} is true. Failures are logged; the lines collected up
 * to that point are returned.
 *
 * @param file file to read
 * @return the kept lines, in file order; never {@code null}
 */
public static List<String> readLines(File file) {
    List<String> lines = new ArrayList<>();
    // try-with-resources: the reader was never closed before.
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = br.readLine()) != null) {
            if (StringUtils.isNotBlank(line)) {
                lines.add(line);
            }
        }
    } catch (Exception e) {
        // Pass the file so the {} placeholder is filled; the exception stays last so it is logged with its stack trace.
        log.error("readLines from file<{}> error:", file, e);
    }
    return lines;
}

/**
 * Reads the file named {@code fileName}; see {@link #readLines(File)}.
 *
 * @param fileName path of the file to read
 * @return the kept lines, in file order; never {@code null}
 */
public static List<String> readLines(String fileName) {
    return readLines(new File(fileName));
}
}
......@@ -2004,11 +2004,10 @@ public abstract class StringUtils {
/**
* 逗号等分隔符转空格,去两边空格,中间多个空格转一个空格
*/
public static String cleanStr(String str) {
public static String cleanMultiBlank(String str) {
// 去两边空格,转小写,中间多个空格转一个空格
if (str != null) {
String clean = str
.replaceAll(",|,|、|;|;", " ")
.trim()
.replaceAll("\\s{1,}", " ");
return clean;
......
sqp.service.name=sqb4j
sqp.service.version=1.4
sqp.service.level=product
sqp.zookeeper.hosts=center1.secoo-inc.com:2181,center2.secoo-inc.com:2181,center3.secoo-inc.com:2181
image.delete.url=http://10.16.9.28:8080
image.update.url=http://10.16.8.44:8080
prefixFilterList=["https://", "http://", "dg", "d & g", "dolce&gabbana","dolce & gabbana", "\u675C\u5609\u73ED\u7EB3", "\u907F\u5B55", "\u60C5\u8DA3", "cucci", "\u4E52\u4E53\u7403", "cuccl", "gucii","tod's","iwc7"]
ManualFolder=/data/pssmaster/corpus_set/suggest_corpus/manual
SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word
# suggestTask
suggestTask.prefixFilterList=["https://", "http://", "dg", "d & g", "dolce&gabbana","dolce & gabbana", "\u675C\u5609\u73ED\u7EB3", "\u907F\u5B55", "\u60C5\u8DA3", "cucci", "\u4E52\u4E53\u7403", "cuccl", "gucii","tod's","iwc7"]
suggestTask.ManualFolder=/data/pssmaster/corpus_set/suggest_corpus/manual
suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.batchSize=10000
suggestTask.threadPoolSize=10
suggestTask.searchWordWarningCount=1000000
suggestTask.suggestTagMaxSize=5
suggestTask.warningPhones=13426233960
sqp.service.name=sqb4j
sqp.service.version=1.4
sqp.service.level=product
sqp.zookeeper.hosts=center1.secoo-inc.com:2181,center2.secoo-inc.com:2181,center3.secoo-inc.com:2181
image.delete.url=http://192.168.70.141:8080
\ No newline at end of file
# suggestTask
suggestTask.prefixFilterList=["https://", "http://", "dg", "d & g", "dolce&gabbana","dolce & gabbana", "\u675C\u5609\u73ED\u7EB3", "\u907F\u5B55", "\u60C5\u8DA3", "cucci", "\u4E52\u4E53\u7403", "cuccl", "gucii","tod's","iwc7"]
suggestTask.ManualFolder=/data/pssmaster/corpus_set/suggest_corpus/manual
suggestTask.SensitiveFolder=/data/pssmaster/corpus_set/suggest_corpus/sensitive
suggestTask.EuropeWordFolder=/data/pssmaster/corpus_set/suggest_corpus/europe_word
suggestTask.batchSize=10000
suggestTask.threadPoolSize=10
suggestTask.suggestTagMaxSize=5
suggestTask.searchWordWarningCount=1000000
\ No newline at end of file
{
"state": "open",
"settings": {
"index": {
"number_of_shards": "1",
"provided_name": "search_suggest_index",
"creation_date": "1551702662623",
"analysis": {
"analyzer": {
"suggest_analyzer": {
"tokenizer": "suggest_tokenizer"
}
},
"tokenizer": {
"suggest_tokenizer": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
}
},
"number_of_replicas": "2",
"uuid": "GdxvBgzsSICrpSddf6bqIQ",
"version": {
"created": "6040099"
}
}
},
"mappings": {
"search_suggest_type": {
"properties": {
"isEuropeWord": {
"type": "boolean"
},
"yearCount": {
"type": "integer"
},
"yearCartRatio": {
"type": "double"
},
"weekClickRatio": {
"type": "double"
},
"weekCount": {
"type": "integer"
},
"wordABRank": {
"type": "float"
},
"IsEuropeWord": {
"type": "boolean"
},
"analyzer": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"weekClickCount": {
"type": "integer"
},
"text": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"keyword": {
"analyzer": "suggest_analyzer",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"isManual": {
"type": "boolean"
},
"keywordPinYin": {
"analyzer": "suggest_analyzer",
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"query": {
"properties": {
"bool": {
"properties": {
"must": {
"properties": {
"term": {
"properties": {
"keywordVersion": {
"type": "date"
}
}
}
}
}
}
}
}
},
"weekCartRatio": {
"type": "double"
},
"yearClickCount": {
"type": "integer"
},
"updateTime": {
"type": "long"
},
"yearCartCount": {
"type": "integer"
},
"keywordVersion": {
"type": "keyword"
},
"yearClickRatio": {
"type": "double"
},
"isCategory": {
"type": "boolean"
},
"field": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"isSensitive": {
"type": "boolean"
},
"suggestTags": {
"type": "keyword"
},
"weekCartCount": {
"type": "integer"
},
"doc": {
"properties": {
"isManual": {
"type": "boolean"
},
"keywordPinYin": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"weekCartRatio": {
"type": "float"
},
"yearClickCount": {
"type": "long"
},
"updateTime": {
"type": "long"
},
"yearCartCount": {
"type": "long"
},
"yearCount": {
"type": "long"
},
"yearCartRatio": {
"type": "float"
},
"weekClickRatio": {
"type": "float"
},
"weekCount": {
"type": "long"
},
"wordABRank": {
"type": "float"
},
"keywordVersion": {
"type": "date"
},
"yearClickRatio": {
"type": "float"
},
"isCategory": {
"type": "boolean"
},
"isSensitive": {
"type": "boolean"
},
"suggestTags": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"weekClickCount": {
"type": "long"
},
"weekCartCount": {
"type": "long"
},
"wordRank": {
"type": "float"
},
"keyword": {
"type": "text",
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
}
},
"manualValue": {
"type": "long"
},
"isBrand": {
"type": "boolean"
}
}
},
"wordRank": {
"type": "double"
},
"manualValue": {
"type": "integer"
},
"isBrand": {
"type": "boolean"
}
}
}
},
"aliases": [
],
"primary_terms": {
"0": 3
},
"in_sync_allocations": {
"0": [
"rmaxShfDRkCpdv91Iz4nkQ",
"tWXAarrcTQmXvB07MuWLYg",
"lcW9Sv9MTgSkKb9XmmUzOQ"
]
}
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment