
实验环境:Windows 10、Ubuntu Kylin 16.04、Java 8、Hadoop 2.7.1、Hive 1.2.2、IDEA 2020.2.3、PyCharm 2021.1.3、Eclipse 3.8。
本文通过MapReduce对最值、数量、排序、TopN、自定义分区排序、二次排序、自定义类、占比等8个方面进行统计分析。
二手房房价的最值是体现一个城市经济的重要因素,也是顾客购买的衡量因素之一。
Driver端:
/**
 * Driver that configures and submits the "MaxMinTotalPriceByCity" job.
 *
 * <p>The job wires {@code MaxMinTotalPriceByCityMapper} to
 * {@code MaxMinTotalPriceByCityReducer}, uses {@code Text}/{@code IntWritable}
 * intermediate pairs and {@code Text}/{@code Text} final pairs, reads
 * {@code datas/tb_house.txt} and writes to
 * {@code MapReduce/out/MaxMinTotalPriceByCity}.
 */
public class MaxMinTotalPriceByCityDriver {

    /**
     * Submits the job, waits for it to finish and exits the JVM with status 0
     * on success or 1 on failure.
     *
     * @param args ignored; input and output paths are fixed
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "MaxMinTotalPriceByCity");
        job.setJarByClass(MaxMinTotalPriceByCityDriver.class);

        job.setMapperClass(MaxMinTotalPriceByCityMapper.class);
        job.setReducerClass(MaxMinTotalPriceByCityReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("datas/tb_house.txt"));
        FileOutputFormat.setOutputPath(job, new Path("MapReduce/out/MaxMinTotalPriceByCity"));

        // Surface job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
public class MaxMinTotalPriceByCityMapper extends Mapper
Reducer端:
public class MaxMinTotalPriceByCityReducer extends Reducer {@Overrideprotected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {List totalList = new ArrayList();Iterator iterator = values.iterator();while (iterator.hasNext()) {totalList.add(iterator.next().get());}Collections.sort(totalList);int max = totalList.get(totalList.size() - 1);int min = totalList.get(0);Text outv = new Text();outv.set("房子总价最大、小值分别为:" + String.valueOf(max) + "万元," + String.valueOf(min) + "万元");context.write(key, outv);}
}
运行情况:

结果:

二手房的数量是了解房子基本情况的维度之一,数量的多少在一定程度上体现了房子的受欢迎度。

Driver端:
/**
 * Driver that configures and submits the "HouseCntByCity" job against the
 * HDFS instance at {@code hdfs://node01:9000}.
 *
 * <p>The job uses {@code HouseCntByCityMapper}, {@code HouseCntByCityReducer}
 * and {@code CityPartitioner} with four reduce tasks; intermediate and final
 * pairs are both {@code Text}/{@code IntWritable}.
 */
public class HouseCntByCityDriver {

    /**
     * Submits the job, waits for it to finish and exits the JVM with status 0
     * on success or 1 on failure.
     *
     * @param args ignored; input and output paths are fixed
     * @throws Exception if the job cannot be created, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        // Paths are fixed for this job; keep them in named locals rather than
        // overwriting the method parameter.
        String inputPath = "/input/datas/tb_house.txt";
        String outputPath = "/output/HouseCntByCity";

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node01:9000");

        Job job = Job.getInstance(conf, "HouseCntByCity");
        job.setJarByClass(HouseCntByCityDriver.class);

        job.setMapperClass(HouseCntByCityMapper.class);
        job.setReducerClass(HouseCntByCityReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // NOTE(review): four reduce tasks presumably correspond to the
        // partitions produced by CityPartitioner (not shown here) - confirm.
        job.setPartitionerClass(CityPartitioner.class);
        job.setNumReduceTasks(4);

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Surface job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class HouseCntByCityMapper extends Mapper
Reducer端:
public class HouseCntByCityReducer extends Reducer {@Overrideprotected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) sum += val.get();context.write(key, new IntWritable(sum));}
}


二手房的信息发布时间是了解房子基本情况的维度之一,在一定程度上,顾客倾向于最新的房源信息。
Driver端:
public class AcessHousePubTimeSortDriver {public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {Configuration conf = new Configuration()Job job = Job.getInstance(conf, "AcessHousePubTimeSort");job.setJarByClass(AcessHousePubTimeSortDriver.class);job.setMapperClass(AcessHousePubTimeSortMapper.class);job.setReducerClass(AcessHousePubTimeSortReducer.class);job.setMapOutputKeyClass(Text.class);job.setMapOutputValueClass(IntWritable.class);job.setOutputKeyClass(Text.class);job.setOutputValueClass(IntWritable.class);FileInputFormat.setInputPaths(job, new Path("datas/tb_house.txt"));FileOutputFormat.setOutputPath(job, new Path("MapReduce/out/AcessHousePubTimeSort"));job.waitForCompletion(true);}
}
Mapper端:
public class AcessHousePubTimeSortMapper extends Mapper
Reducer端:
public class AcessHousePubTimeSortReducer extends Reducer {@Overrideprotected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {int sum = 0;for (IntWritable val : values) sum += val.get();context.write(key, new IntWritable(sum));}
}


TopN是MapReduce分析最常见且必不可少的一个例子。
Driver端:
/**
 * Driver that configures and submits the TotalPriceTop5ByCity job.
 *
 * <p>The job wires {@code TotalPriceTop5ByCityMapper} to
 * {@code TotalPriceTop5ByCityReducer} with a single reduce task, using
 * {@code Text}/{@code IntWritable} intermediate pairs and
 * {@code Text}/{@code Text} final pairs.
 */
public class TotalPriceTop5ByCityDriver {

    /** Input path used when no command-line arguments are supplied. */
    private static final String DEFAULT_INPUT = "datas/tb_house.txt";

    /** Output path used when no command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "MapReduce/out/TotalPriceTop5ByCity";

    /**
     * Submits the job, waits for it to finish and exits the JVM with status 0
     * on success, 1 on job failure, or 2 on a bad argument count.
     *
     * @param args either empty (the default paths are used) or exactly two
     *             elements: the input path followed by the output path
     * @throws Exception if the job cannot be created, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the built-in paths only when the caller gave none.
        // Overwriting args unconditionally made the usage check unreachable.
        if (args.length == 0) {
            args = new String[] { DEFAULT_INPUT, DEFAULT_OUTPUT };
        }
        if (args.length != 2) {
            System.err.println("Usage: TotalPriceTop5ByCity INPUT_PATH OUTPUT_PATH");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TotalPriceTop5ByCityDriver.class);

        job.setMapperClass(TotalPriceTop5ByCityMapper.class);
        job.setReducerClass(TotalPriceTop5ByCityReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class TotalPriceTop5ByCityMapper extends Mapper
Reducer端:
public class TotalPriceTop5ByCityReducer extends Reducer {private Text outv = new Text();private int len = 0;@Overrideprotected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {List totalPriceList = new ArrayList();Iterator iterator = values.iterator();while (iterator.hasNext()) {totalPriceList.add(iterator.next().get());}Collections.sort(totalPriceList);int size = totalPriceList.size();String top5Str = "二手房总价Top5:";for (int i = 1; i <= 5; i++) {if (i == 5) {top5Str += totalPriceList.get(size - i) + "万元";} else {top5Str += totalPriceList.get(size - i) + "万元, ";}}outv.set(String.valueOf(top5Str));context.write(key, outv);}
}


自定义分区全排序可以实现不同于以往的排序方式,展示效果与默认全排序可以体现出一定的差别。
public class TotalOrderingPartition extends Configured implements Tool {static class SimpleMapper extends Mapper


…

…

…

某些时候按照一个字段的排序方式并不能让我们满意,二次排序则是解决这个问题的一个方法。
Driver端:

Mapper端:

Reducer端:



某些字段通过MapReduce不可以直接统计得到,这时采用自定义类的方式便可以做到。
自定义类:
public class HouseCntByPositionTopListBean implements Writable {private Text info;private IntWritable cnt;public Text getInfo() {return info;}public void setInfo(Text info) {this.info = info;}public IntWritable getCnt() {return cnt;}public void setCnt(IntWritable cnt) {this.cnt = cnt;}@Overridepublic void readFields(DataInput in) throws IOException {this.cnt = new IntWritable(in.readInt());}@Overridepublic void write(DataOutput out) throws IOException {out.writeInt(cnt.get());}@Overridepublic String toString() {String infoStr = info.toString();int idx = infoStr.indexOf("-");String city = infoStr.substring(0, idx);String position = infoStr.substring(idx + 1);return city + "#" + "[" + position + "]" + "#" + cnt;}
}
Driver端:

Mapper端:

Reducer端:





占比分析同样是MapReduce统计分析的一大常用方式。
Driver端:
/**
 * Driver that configures and submits the TagRatioByCity job.
 *
 * <p>The job wires {@code TagRatioByCityMapper} to
 * {@code TagRatioByCityReducer}, uses {@code Text}/{@code IntWritable}
 * intermediate pairs and {@code Text}/{@code Text} final pairs, reads
 * {@code datas/tb_house.txt} and writes to
 * {@code MapReduce/out/TagRatioByCity}.
 */
public class TagRatioByCityDriver {

    /**
     * Submits the job, waits for it to finish and exits the JVM with status 0
     * on success or 1 on failure.
     *
     * @param args ignored; input and output paths are fixed
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Paths are fixed for this job; keep them in named locals rather than
        // overwriting the method parameter.
        String inputPath = "datas/tb_house.txt";
        String outputPath = "MapReduce/out/TagRatioByCity";

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TagRatioByCityDriver.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(TagRatioByCityMapper.class);
        job.setReducerClass(TagRatioByCityReducer.class);

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        // Surface job failure to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper端:
public class TagRatioByCityMapper extends Mapper
Reducer端:
public class TagRatioByCityReducer extends Reducer {private Text outv = new Text();private int sum = 0;@Overrideprotected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {DecimalFormat df = new DecimalFormat("0.00");int cnt = 0;for (IntWritable value : values) {cnt += value.get();}String s = key.toString();String format = "";if (s.contains("上海")) {sum = 2995;format = df.format((double) cnt / sum * 100) + "%";} else if (s.contains("北京")) {sum = 2972;format = df.format((double) cnt / sum * 100) + "%";} else if (s.contains("广州")) {sum = 2699;format = df.format((double) cnt / sum * 100) + "%";} else {sum = 2982;format = df.format((double) cnt / sum * 100) + "%";}outv.set(format);context.write(key, outv);}
}

tp
Github
Gitee
MapReduce统计分析过程需要比较细心,「根据二手房信息发布时间排序统计」这个涉及到Java中日期类SimpleDateFormat和Date的使用,需要慢慢调试得出结果;统计最值和占比的难度并不高,主要在于统计要计算的类别的数量和总数量,最后二者相除即可;二次排序和自定义类难度较高,但一步一步来还是可以实现的。
结束!

上一篇:图形驱动软件栈
下一篇:关于IO流的基础理论