一道hadoop面试题

这题是网上找的，如果做得不对，请大家指正。

1 使用Hive或者自定义MR实现如下逻辑
product_no    lac_id  moment  start_time    user_id county_id    staytime       city_id
13429100031    22554 8    2013-03-11 08:55:19.151754088 571    571    282    571
13429100082    22540 8    2013-03-11 08:58:20.152622488 571    571    270    571
13429100082    22691 8    2013-03-11 08:56:37.149593624 571    571    103    571
13429100087    22705 8    2013-03-11 08:56:51.139539816 571    571    220    571
13429100087    22540 8    2013-03-11 08:55:45.150276800 571    571    66    571
13429100082    22540 8    2013-03-11 08:55:38.140225200 571    571    133    571
13429100140    26642 9    2013-03-11 09:02:19.151754088 571    571    18    571
13429100082    22691 8    2013-03-11 08:57:32.151754088 571    571    287    571
13429100189    22558 8    2013-03-11 08:56:24.139539816 571    571    48    571
13429100349    22503 8    2013-03-11 08:54:30.152622440 571    571    211    571
字段解释：
product_no：用户手机号；
lac_id：用户所在基站；
start_time：用户在此基站的开始时间；
staytime：用户在此基站的逗留时间。

需求描述：
根据lac_id和start_time知道用户当时的位置，根据staytime知道用户在各个基站的逗留时长。根据轨迹合并连续基站的staytime，
最终得到每一个用户按时间排序、在每一个基站的驻留时长。

期望输出举例：
13429100082    22540 8    2013-03-11 08:58:20.152622488 571    571    270    571
13429100082    22691 8    2013-03-11 08:56:37.149593624 571    571    390    571
13429100082    22540 8    2013-03-11 08:55:38.140225200 571    571    133    571
13429100087    22705 8    2013-03-11 08:56:51.139539816 571    571    220    571
13429100087    22540 8    2013-03-11 08:55:45.150276800 571    571    66    571

说说我的思路：先按照TextInputFormat进行map，在map函数中对每一行进行处理，将手机号作为map的outputkey，行内容作为outputvalue；在reduce中再按照时间排序。

[mw_shl_code=java,true]package hadoop;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class HadoopTest1
{
public static String split = " +|\t";  //定义一个分隔符，空格和tab都可以

public static class MyComarator implements Comparator //由于不是按照整个字符串比较，所以实现一个Comparator接口，按时间来比较
{
@Override
public int compare(Object o1, Object o2)
{
// TODO Auto-generated method stub
String str1 = (String)o1;
String str2 = (String)o2;

String []arr1 = str1.split(split);
String []arr2 = str2.split(split);

return (arr1[3] + arr1[4]).compareTo((arr2[3] + arr2[4]));
}
}

public static class MyMapper extends Mapper<LongWritable, Text, Text, Text>
{
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
{
if (key.equals(new LongWritable(0)))  //过滤掉第一行
{
return;
}
String line = value.toString();
String[] elements = line.split(split);
context.write(new Text(elements[0]), value);
}
}
public static class MyReducer extends Reducer<Text, Text, NullWritable, Text>
{
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
{
List<String>list = new ArrayList<String>();

for (Text v : values)
{
list.add(v.toString());
}

list.sort(new MyComarator());
Collections.reverse(list);

for (int i =0; i < list.size(); ++i)
{
context.write(NullWritable.get(), new Text(list.get(i)));
}
}
}

public static void main(String[] args)
{
String HDFS_PATH = "hdfs://master:9000";
String INPUT_PATH = "/home/hadoop/hadoop-data/20150721/input";
String OUTT_PATH  = "/home/hadoop/hadoop-data/20150721/output";

try
{
FileSystem fs = FileSystem.get(new URI(HDFS_PATH), new Configuration());
FSDataOutputStream out = fs.create(new Path(HDFS_PATH + INPUT_PATH + "/text"));
String text = "product_no lac_id  moment          start_time       user_id  county_id  staytime  city_id\n"
+ "13429100031    22554 8    2013-03-11 08:55:19.151754088 571    571    282    571\n"
+ "13429100082    22540 8    2013-03-11 08:58:20.152622488 571    571    270    571\n"
+ "13429100082    22691 8    2013-03-11 08:56:37.149593624 571    571    103    571\n"
+ "13429100087    22705 8    2013-03-11 08:56:51.139539816 571    571    220    571\n"
+ "13429100087    22540 8    2013-03-11 08:55:45.150276800 571    571    66    571\n"
+ "13429100082    22540 8    2013-03-11 08:55:38.140225200 571    571    133    571\n"
+ "13429100140    26642 9    2013-03-11 09:02:19.151754088 571    571    18    571\n"
+ "13429100082    22691 8    2013-03-11 08:57:32.151754088 571    571    287    571\n"
+ "13429100189    22558 8    2013-03-11 08:56:24.139539816 571    571    48    571\n"
+ "13429100349    22503 8    2013-03-11 08:54:30.152622440 571    571    211    571";
out.write(text.getBytes());
out.close();

Job job = new Job(new Configuration(), "HadoopTest1");
job.setJarByClass(HadoopTest1.class);

job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);

job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);

if (fs.exists(new Path(HDFS_PATH + OUTT_PATH))) //删除已有的输出文件
{
fs.delete(new Path(HDFS_PATH + OUTT_PATH), true);
}

TextInputFormat.addInputPath(job, new Path(HDFS_PATH + INPUT_PATH));
FileOutputFormat.setOutputPath(job, new Path(HDFS_PATH + OUTT_PATH));

job.waitForCompletion(true);

}
catch (URISyntaxException e)
{
e.printStackTrace();
}
catch (IOException e)
{
e.printStackTrace();
}
catch (ClassNotFoundException e)
{
e.printStackTrace();
}
catch (InterruptedException e)
{
e.printStackTrace();
}
}
}
[/mw_shl_code]

最后的输出结果：
[mw_shl_code=bash,true]13429100031    22554 8    2013-03-11 08:55:19.151754088 571    571    282    571
13429100082    22540 8    2013-03-11 08:58:20.152622488 571    571    270    571
13429100082    22691 8    2013-03-11 08:57:32.151754088 571    571    287    571
13429100082    22691 8    2013-03-11 08:56:37.149593624 571    571    103    571
13429100082    22540 8    2013-03-11 08:55:38.140225200 571    571    133    571
13429100087    22705 8    2013-03-11 08:56:51.139539816 571    571    220    571
13429100087    22540 8    2013-03-11 08:55:45.150276800 571    571    66    571
13429100140    26642 9    2013-03-11 09:02:19.151754088 571    571    18    571
13429100189    22558 8    2013-03-11 08:56:24.139539816 571    571    48    571
13429100349    22503 8    2013-03-11 08:54:30.152622440 571    571    211    571
[/mw_shl_code]