Commit 48ea4f19 by zhaoyanchao

Add code: query feature extraction job

.idea/
*.iml
target/
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.secoo.search</groupId>
    <artifactId>search-model-data</artifactId>
    <version>1.0-SNAPSHOT</version>
    <name>search-model-data</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <!-- hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.httpcomponents</groupId>
                    <artifactId>httpclient</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>commons-httpclient</groupId>
                    <artifactId>commons-httpclient</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.httpcomponents</groupId>
                    <artifactId>httpcore</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-yarn-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.6</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>com.secoo</groupId>
            <artifactId>sqp4j-client</artifactId>
            <version>2.8.2.RELEASE</version>
            <exclusions>
                <exclusion>
                    <groupId>com.secoo.mall</groupId>
                    <artifactId>secoo-log</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-common</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-encoding</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-column</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>1.11.0</version>
        </dependency>
        <dependency>
            <groupId>com.secoo.so.common</groupId>
            <artifactId>common-data-api</artifactId>
            <version>1.9.4</version>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!--<plugin>-->
            <!--<groupId>org.apache.maven.plugins</groupId>-->
            <!--<artifactId>maven-assembly-plugin</artifactId>-->
            <!--<version>2.6</version>-->
            <!--<configuration>-->
            <!--<appendAssemblyId>false</appendAssemblyId>-->
            <!--<descriptors>-->
            <!--<descriptor>src/main/assembly/assembly.xml</descriptor>-->
            <!--</descriptors>-->
            <!--</configuration>-->
            <!--<executions>-->
            <!--<execution>-->
            <!--<id>make-assembly</id>-->
            <!--<phase>package</phase>-->
            <!--<goals>-->
            <!--<goal>single</goal>-->
            <!--</goals>-->
            <!--</execution>-->
            <!--</executions>-->
            <!--</plugin>-->
            <!-- Build a jar-with-dependencies package -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.6</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
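
With the assembly plugin bound to the package phase, a plain Maven build produces the jar-with-dependencies that the run script at the end of this commit consumes; a typical invocation (skipping tests is optional):

mvn clean package -DskipTests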
<!--
 - Copyright 1999-2011 Alibaba Group.
 -
 - Licensed under the Apache License, Version 2.0 (the "License");
 - you may not use this file except in compliance with the License.
 - You may obtain a copy of the License at
 -
 -      http://www.apache.org/licenses/LICENSE-2.0
 -
 - Unless required by applicable law or agreed to in writing, software
 - distributed under the License is distributed on an "AS IS" BASIS,
 - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - See the License for the specific language governing permissions and
 - limitations under the License.
-->
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.2
                              http://maven.apache.org/xsd/assembly-1.1.2.xsd">
    <id>assembly</id>
    <formats>
        <format>tar.gz</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/assembly/scripts</directory>
            <outputDirectory>bin</outputDirectory>
            <fileMode>0755</fileMode>
            <directoryMode>0755</directoryMode>
        </fileSet>
    </fileSets>
    <dependencySets>
        <dependencySet>
            <outputDirectory>lib</outputDirectory>
        </dependencySet>
    </dependencySets>
</assembly>
package com.secoo.search;

// Use commons-lang3, which this module declares directly; the commons-lang 2.x
// class is only available transitively through hadoop-common.
import org.apache.commons.lang3.StringUtils;

public class App {
    public static void main(String[] args) {
        // Sanity check: prints "true" for a purely numeric string.
        System.out.println(StringUtils.isNumeric("1111122"));
    }
}
package com.secoo.search.job.keyword;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * MapReduce job that extracts query features for each distinct search keyword.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractJob extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(KeywordFeatureExtractJob.class);

    public static void main(String[] args) {
        try {
            int ret = ToolRunner.run(new KeywordFeatureExtractJob(), args);
            System.exit(ret);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        // Build the job on top of getConf() so that ToolRunner's generic
        // options (-D key=value) are actually picked up.
        Job job = Job.getInstance(getConf(), "query-feature-extract-job");
        Configuration conf = job.getConfiguration();
        // mapreduce.job.maps is only a hint; the input format decides the split count.
        conf.set("mapreduce.job.maps", "30");
        conf.set("mapreduce.job.reduces", "10");
        job.setJarByClass(KeywordFeatureExtractJob.class);

        job.setMapperClass(KeywordFeatureExtractMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_original_query_last_year"));

        job.setReducerClass(KeywordFeatureExtractReduce.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path("hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_query_original_feature"));

        boolean success = job.waitForCompletion(true);
        if (!success) {
            LOG.error("job failed");
            return -1;
        }
        return 0;
    }
}
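
Because run() builds the Job from getConf(), ToolRunner's generic options are honored at submit time for any setting not hard-coded above. A sketch of such an invocation; the queue name is purely illustrative:

yarn jar search-model-data-1.0-SNAPSHOT-jar-with-dependencies.jar \
    com.secoo.search.job.keyword.KeywordFeatureExtractJob \
    -Dmapreduce.job.queuename=default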
package com.secoo.search.job.keyword;

// commons-lang3 is declared directly in the pom; avoid relying on Hadoop's
// transitive commons-lang 2.x.
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Emits each keyword as the map output key (with a NullWritable value),
 * so duplicate keywords are collapsed before the reduce phase processes them.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractMap extends Mapper<Object, Text, Text, NullWritable> {

    private static final int COLS = 1;

    // Reuse the output key object across records, as is idiomatic in MapReduce.
    private final Text outKey = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        String line = String.valueOf(value);
        String[] items = line.trim().split("\t");
        if (items.length == COLS) {
            if (StringUtils.isNotBlank(items[0]) && !filter(items[0])) {
                outKey.set(items[0].trim());
                context.write(outKey, NullWritable.get());
            }
        }
    }

    /**
     * Filters out some query terms, e.g. purely numeric ones.
     */
    private boolean filter(String item) {
        return StringUtils.isNumeric(item);
    }
}
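
For illustration, given input lines like the ones below (one single-column record per line), the mapper would emit "gucci" and "lv 女包" as keys and drop "1111122" via the purely-numeric filter:

gucci
1111122
lv 女包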
package com.secoo.search.job.keyword;

import com.secoo.so.common.constant.Environment;
import com.secoo.so.common.query.NormalQueryPlanServiceImpl;
import com.secoo.so.common.query.QueryFeature;
import com.secoo.so.common.query.QueryPlanService;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Looks up query features for each distinct keyword and writes one
 * tab-separated record per keyword.
 *
 * @author zhaoyanchao
 */
public class KeywordFeatureExtractReduce extends Reducer<Text, NullWritable, NullWritable, Text> {

    private static final String FIELD_SEPARATOR = "\t";

    private QueryPlanService planService;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        planService = new NormalQueryPlanServiceImpl(Environment.QUERY_PLAN_PROD);
    }

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        if (key == null) {
            return;
        }
        String keyword = key.toString();
        if (StringUtils.isBlank(keyword)) {
            return;
        }
        QueryFeature queryFeature = planService.extractQueryFeature(keyword);
        // Pad category ids to 5 columns and brand ids to 3 columns.
        String cateStr = getListStr(queryFeature.getCateIds(), 5);
        String brandStr = getListStr(queryFeature.getBrandIds(), 3);
        String record = new StringBuilder().append(keyword).append(FIELD_SEPARATOR)
                .append(cateStr).append(FIELD_SEPARATOR)
                .append(brandStr).append(FIELD_SEPARATOR)
                .append(queryFeature.getGender()).append(FIELD_SEPARATOR)
                .append(queryFeature.getContainsOtherWord()).toString();
        context.write(NullWritable.get(), new Text(record));
        // Throttle a little so we do not overload the sqp service.
        Thread.sleep(5);
    }

    /**
     * The input list may be null or shorter than {@code num}; missing or null
     * positions are padded with 0 so the output always has {@code num} columns.
     */
    private String getListStr(List<Long> ids, int num) {
        List<Long> padded = new ArrayList<>();
        if (ids == null) {
            ids = Collections.emptyList();
        }
        for (int i = 0; i < num; ++i) {
            if (i < ids.size() && ids.get(i) != null) {
                padded.add(ids.get(i));
            } else {
                padded.add(0L);
            }
        }
        return StringUtils.join(padded, FIELD_SEPARATOR);
    }
}
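
For illustration, each output record is a single line with 11 tab-separated columns:

keyword, cate1 .. cate5, brand1 .. brand3, gender, containsOtherWord

e.g. keyword "lv 女包" with category ids 10 23 0 0 0, brand ids 157 0 0, gender FEMALE, containsOtherWord false. The numeric values here are made up; the actual gender and containsOtherWord encodings depend on what QueryFeature returns.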
package com.secoo.search.testcommondata;

import com.secoo.so.common.constant.Environment;
import com.secoo.so.common.query.NormalQueryPlanServiceImpl;
import com.secoo.so.common.query.QueryPlanService;

/**
 * Manual smoke test: checks that the query-plan service can be reached
 * and can explain a sample query.
 */
public class TestQueryPlanInvoke {
    public static void main(String[] args) {
        QueryPlanService queryPlanService = new NormalQueryPlanServiceImpl(Environment.QUERY_PLAN_PROD);
        queryPlanService.explain("lv 女包");
    }
}
# Create the raw keyword input table
hive -e "create external table if not exists secoo_search.search_data_original_query_last_year
(
    keyword string comment 'search keyword'
) comment 'search keywords from the last year'
row format delimited fields terminated by '\t'
stored as textfile;"

## Pick the most recent p_day=xxxx-xx-xx partition
recent_keyword_day=`hive -e "show partitions secoo_app.app_search_keyword_year_week_p_day" | tail -n 2 | head -n 1`
# Strip the leading "p_day=" (6 characters) and keep the 10-character date
recent_keyword_day=${recent_keyword_day:6:10}
echo "$recent_keyword_day"

# Extract the top keywords into the input table
hive -e "insert overwrite table secoo_search.search_data_original_query_last_year
select M.keyword
from
(
    select T.keyword, T.year_cnt
    from (
        select
            keyword,
            sum(year_pv) as year_cnt
        from secoo_app.app_search_keyword_year_week_p_day
        where p_day = '$recent_keyword_day'
        group by keyword
    ) T
    order by T.year_cnt desc limit 100000
) M;"

# Remove the previous output directory
hdfs dfs -rm -r hdfs://tesla-cluster/apps/hive/warehouse/secoo_search.db/search_data_query_original_feature

# Extract features into the output table
yarn jar search-model-data-1.0-SNAPSHOT-jar-with-dependencies.jar com.secoo.search.job.keyword.KeywordFeatureExtractJob
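
The job writes its text output straight into the warehouse directory of secoo_search.search_data_query_original_feature, but this script never creates that table. A minimal DDL sketch matching the reducer's 11-column tab-separated layout; the column names and the string types for gender and contains_other_word are assumptions, not taken from this commit:

# Hypothetical output table matching the reducer's record layout (assumed column names)
hive -e "create external table if not exists secoo_search.search_data_query_original_feature
(
    keyword string comment 'search keyword',
    cate_id_1 bigint, cate_id_2 bigint, cate_id_3 bigint, cate_id_4 bigint, cate_id_5 bigint,
    brand_id_1 bigint, brand_id_2 bigint, brand_id_3 bigint,
    gender string comment 'gender feature from QueryFeature',
    contains_other_word string comment 'whether the query contains other words'
) comment 'query features extracted by KeywordFeatureExtractJob'
row format delimited fields terminated by '\t'
stored as textfile;"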