Commit 48ea4f19 by zhaoyanchao

Add code: query feature extraction job

.idea/
*.iml
target/
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.secoo.search</groupId>
    <artifactId>search-model-data</artifactId>
    <version>1.0-SNAPSHOT</version>
    <name>search-model-data</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.httpcomponents</groupId>
                    <artifactId>httpclient</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>commons-httpclient</groupId>
                    <artifactId>commons-httpclient</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.httpcomponents</groupId>
                    <artifactId>httpcore</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-yarn-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.6</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>com.secoo</groupId>
            <artifactId>sqp4j-client</artifactId>
            <version>2.8.2.RELEASE</version>
            <exclusions>
                <exclusion>
                    <groupId>com.secoo.mall</groupId>
                    <artifactId>secoo-log</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-common</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-encoding</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-column</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>com.secoo.so.common</groupId>
            <artifactId>common-data-api</artifactId>
            <version>1.9.4</version>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!--<plugin>-->
            <!--<groupId>org.apache.maven.plugins</groupId>-->
            <!--<artifactId>maven-assembly-plugin</artifactId>-->
            <!--<version>2.6</version>-->
            <!--<configuration>-->
            <!--<appendAssemblyId>false</appendAssemblyId>-->
            <!--<descriptors>-->
            <!--<descriptor>src/main/assembly/assembly.xml</descriptor>-->
            <!--</descriptors>-->
            <!--</configuration>-->
            <!--<executions>-->
            <!--<execution>-->
            <!--<id>make-assembly</id>-->
            <!--<phase>package</phase>-->
            <!--<goals>-->
            <!--<goal>single</goal>-->
            <!--</goals>-->
            <!--</execution>-->
            <!--</executions>-->
            <!--</plugin>-->
            <!-- Build a jar-with-dependencies package -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.6</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
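
With the assembly plugin bound to the package phase, a plain Maven build produces the jar-with-dependencies that the run script at the end of this commit consumes; a typical invocation (skipping tests is optional):

mvn clean package -DskipTests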
<!--
 - Copyright 1999-2011 Alibaba Group.
 -
 - Licensed under the Apache License, Version 2.0 (the "License");
 - you may not use this file except in compliance with the License.
 - You may obtain a copy of the License at
 -
 -      http://www.apache.org/licenses/LICENSE-2.0
 -
 - Unless required by applicable law or agreed to in writing, software
 - distributed under the License is distributed on an "AS IS" BASIS,
 - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - See the License for the specific language governing permissions and
 - limitations under the License.
-->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2
                              http://maven.apache.org/xsd/assembly-1.1.2.xsd">
    <id>assembly</id>
    <formats>
        <format>tar.gz</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/assembly/scripts</directory>
            <outputDirectory>bin</outputDirectory>
            <fileMode>0755</fileMode>
            <directoryMode>0755</directoryMode>
        </fileSet>
    </fileSets>
    <dependencySets>
        <dependencySet>
            <outputDirectory>lib</outputDirectory>
        </dependencySet>
    </dependencySets>
</assembly>
package com.secoo.search;

// Use commons-lang3, which this module declares directly; the commons-lang 2.x
// class is only available transitively through hadoop-common.
import org.apache.commons.lang3.StringUtils;

public class App {
    public static void main(String[] args) {
        // Sanity check: prints "true" for a purely numeric string.
        System.out.println(StringUtils.isNumeric("1111122"));
    }
}
package com.secoo.search.job.keyword;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MapReduce job that extracts query features for each distinct search keyword.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractJob extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(KeywordFeatureExtractJob.class);

    public static void main(String[] args) {
        try {
            int ret = ToolRunner.run(new KeywordFeatureExtractJob(), args);
            System.exit(ret);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        // Build the job on top of getConf() so that ToolRunner's generic
        // options (-D key=value) are actually picked up.
        Job job = Job.getInstance(getConf(), "query-feature-extract-job");
        Configuration conf = job.getConfiguration();
        // mapreduce.job.maps is only a hint; the input format decides the split count.
        conf.set("mapreduce.job.maps", "30");
        conf.set("mapreduce.job.reduces", "10");
        job.setJarByClass(KeywordFeatureExtractJob.class);

        job.setMapperClass(KeywordFeatureExtractMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_original_query_last_year"));

        job.setReducerClass(KeywordFeatureExtractReduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_query_original_feature"));

        boolean success = job.waitForCompletion(true);
        if (!success) {
            LOG.error("job failed");
            return -1;
        }
        return 0;
    }
}
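
Because run() builds the Job from getConf(), ToolRunner's generic options are honored at submit time for any setting not hard-coded above. A sketch of such an invocation; the queue name is purely illustrative:

yarn jar search-model-data-1.0-SNAPSHOT-jar-with-dependencies.jar \
    com.secoo.search.job.keyword.KeywordFeatureExtractJob \
    -Dmapreduce.job.queuename=default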
package com.secoo.search.job.keyword;

// commons-lang3 is declared directly in the pom; avoid relying on Hadoop's
// transitive commons-lang 2.x.
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Emits each keyword as the map output key (with a NullWritable value),
 * so duplicate keywords are collapsed before the reduce phase processes them.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractMap extends Mapper<Object, Text, Text, NullWritable> {

    private static final int COLS = 1;

    // Reuse the output key object across records, as is idiomatic in MapReduce.
    private final Text outKey = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = String.valueOf(value);
        String[] items = line.trim().split("\t");
        if (items.length == COLS) {
            if (StringUtils.isNotBlank(items[0]) && !filter(items[0])) {
                outKey.set(items[0].trim());
                context.write(outKey, NullWritable.get());
            }
        }
    }

    /**
     * Filters out some query terms, e.g. purely numeric ones.
     */
    private boolean filter(String item) {
        return StringUtils.isNumeric(item);
    }
}
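
For illustration, given input lines like the ones below (one single-column record per line), the mapper would emit "gucci" and "lv 女包" as keys and drop "1111122" via the purely-numeric filter:

gucci
1111122
lv 女包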
package com.secoo.search.job.keyword;

import com.secoo.so.common.constant.Environment;
import com.secoo.so.common.query.NormalQueryPlanServiceImpl;
import com.secoo.so.common.query.QueryFeature;
import com.secoo.so.common.query.QueryPlanService;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Looks up query features for each distinct keyword and writes one
 * tab-separated record per keyword.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractReduce extends Reducer<Text, NullWritable, NullWritable, Text> {

    private static final String FIELD_SEPARATOR = "\t";

    private QueryPlanService planService;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        planService = new NormalQueryPlanServiceImpl(Environment.QUERY_PLAN_PROD);
    }

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        if (key == null) {
            return;
        }
        String keyword = key.toString();
        if (StringUtils.isBlank(keyword)) {
            return;
        }
        QueryFeature queryFeature = planService.extractQueryFeature(keyword);
        // Pad category ids to 5 columns and brand ids to 3 columns.
        String cateStr = getListStr(queryFeature.getCateIds(), 5);
        String brandStr = getListStr(queryFeature.getBrandIds(), 3);
        String record = new StringBuilder().append(keyword).append(FIELD_SEPARATOR)
                .append(cateStr).append(FIELD_SEPARATOR)
                .append(brandStr).append(FIELD_SEPARATOR)
                .append(queryFeature.getGender()).append(FIELD_SEPARATOR)
                .append(queryFeature.getContainsOtherWord()).toString();
        context.write(NullWritable.get(), new Text(record));
        // Throttle a little so we do not overload the sqp service.
        Thread.sleep(5);
    }

    /**
     * The input list may be null or shorter than {@code num}; missing or null
     * positions are padded with 0 so the output always has {@code num} columns.
     */
    private String getListStr(List<Long> ids, int num) {
        List<Long> padded = new ArrayList<>();
        if (ids == null) {
            ids = Collections.emptyList();
        }
        for (int i = 0; i < num; ++i) {
            if (i < ids.size() && ids.get(i) != null) {
                padded.add(ids.get(i));
            } else {
                padded.add(0L);
            }
        }
        return StringUtils.join(padded, FIELD_SEPARATOR);
    }
}
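
For illustration, each output record is a single line with 11 tab-separated columns:

keyword, cate1 .. cate5, brand1 .. brand3, gender, containsOtherWord

e.g. keyword "lv 女包" with category ids 10 23 0 0 0, brand ids 157 0 0, gender FEMALE, containsOtherWord false. The numeric values here are made up; the actual gender and containsOtherWord encodings depend on what QueryFeature returns.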
package com.secoo.search.testcommondata;

import com.secoo.so.common.constant.Environment;
import com.secoo.so.common.query.NormalQueryPlanServiceImpl;
import com.secoo.so.common.query.QueryPlanService;

/**
 * Manual smoke test: checks that the query-plan service can be reached
 * and can explain a sample query.
 */
public class TestQueryPlanInvoke {
    public static void main(String[] args) {
        QueryPlanService queryPlanService = new NormalQueryPlanServiceImpl(Environment.QUERY_PLAN_PROD);
        queryPlanService.explain("lv 女包");
    }
}
# Create the raw keyword input table
hive -e "create external table if not exists secoo_search.search_data_original_query_last_year
(
    keyword string comment 'search keyword'
) comment 'search keywords from the last year'
row format delimited fields terminated by '\t'
stored as textfile;"

## Pick the most recent p_day=xxxx-xx-xx partition
recent_keyword_day=`hive -e "show partitions secoo_app.app_search_keyword_year_week_p_day" | tail -n 2 | head -n 1`
# Strip the leading "p_day=" (6 characters) and keep the 10-character date
recent_keyword_day=${recent_keyword_day:6:10}
echo "$recent_keyword_day"

# Extract the top keywords into the input table
hive -e "insert overwrite table secoo_search.search_data_original_query_last_year
select M.keyword
from
(
    select T.keyword, T.year_cnt
    from (
        select
            keyword,
            sum(year_pv) as year_cnt
        from secoo_app.app_search_keyword_year_week_p_day
        where p_day = '$recent_keyword_day'
        group by keyword
    ) T
    order by T.year_cnt desc limit 100000
) M;"

# Remove the previous output directory
hdfs dfs -rm -r hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_query_original_feature

# Extract features into the output table
yarn jar search-model-data-1.0-SNAPSHOT-jar-with-dependencies.jar com.secoo.search.job.keyword.KeywordFeatureExtractJob
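
The job writes its text output straight into the warehouse directory of secoo_search.search_data_query_original_feature, but this script never creates that table. A minimal DDL sketch matching the reducer's 11-column tab-separated layout; the column names and the string types for gender and contains_other_word are assumptions, not taken from this commit:

# Hypothetical output table matching the reducer's record layout (assumed column names)
hive -e "create external table if not exists secoo_search.search_data_query_original_feature
(
    keyword string comment 'search keyword',
    cate_id_1 bigint, cate_id_2 bigint, cate_id_3 bigint, cate_id_4 bigint, cate_id_5 bigint,
    brand_id_1 bigint, brand_id_2 bigint, brand_id_3 bigint,
    gender string comment 'gender feature from QueryFeature',
    contains_other_word string comment 'whether the query contains other words'
) comment 'query features extracted by KeywordFeatureExtractJob'
row format delimited fields terminated by '\t'
stored as textfile;"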