WordCount is the classic introductory example for Hadoop, and it captures the core idea of MapReduce.
The input file is hadoop.txt, with the following content:
hello,java
hello,java,linux,hadoop
hadoop,java,linux
hello,java,linux
linux,c,java
c,php,java
The goal is to count how many times each word appears in the whole file.
The Hadoop way of thinking:
MapReduce -----》
Map:
    Mapper class -----》 processes one input split
    map() -----》 processes one line of that split
        {splits the line into words and emits [word, 1] for each}
Shuffle: pairs with the same key are grouped into <k, [v1, v2, v3, v4]>, keys are sorted in natural order, and each key is routed to a reducer by key.hashCode() % numReduceTasks
Reduce:
    Reducer class -----》 receives the grouped data handed from map to reduce
    reduce() -----》 processes all values for one key
        {iterates over the values, accumulates them, and writes out [word, count]}
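For the sample input above, the data at each stage looks like this (assuming a single reducer):

map output:     [hello,1] [java,1] [hello,1] [java,1] [linux,1] [hadoop,1] ... one pair per word occurrence
after shuffle:  <c,[1,1]> <hadoop,[1,1]> <hello,[1,1,1]> <java,[1,1,1,1,1,1]> <linux,[1,1,1,1]> <php,[1]>
reduce output:  [c,2] [hadoop,2] [hello,3] [java,6] [linux,4] [php,1]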
Directory structure:
Code: MapReduce is classic "fill-in-the-blanks" programming; the framework supplies the skeleton, and you fill in map() and reduce().
package org.shixun;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    /**
     * Driver: configures the job and submits it.
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Load the current Hadoop configuration
        Configuration configuration = new Configuration();
        // 2. Create the job to submit (pass in the configuration; otherwise it is never used)
        Job job = Job.getInstance(configuration);
        // 3. Set the main class
        job.setJarByClass(WordCount.class);
        // 4. Configure the Mapper and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 5. Configure the Reducer and its output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 6. Input path; the default input type is <LongWritable, Text>, i.e. <byte offset, line contents>
        FileInputFormat.addInputPath(job, new Path("D:\\code\\shixun\\mr\\input\\hadoop.txt"));
        // 7. Output path (must not already exist, or the job fails)
        FileOutputFormat.setOutputPath(job, new Path("D:\\code\\shixun\\mr\\output\\out2"));
        // 8. Run the job and wait for completion
        System.out.println(job.waitForCompletion(true) ? "success" : "failed");
    }

    /**
     * Mapper: the core map-side class; each instance processes one input split.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private final LongWritable longWritable = new LongWritable(1);

        // Processes one line of the split
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Get one line of data
            String line = value.toString();
            // Split the line into words and emit [word, 1] for each
            String[] splits = line.split(",");
            for (String split : splits) {
                context.write(new Text(split), longWritable); // e.g. [hello, 1]
            }
        }
    }

    /**
     * Shuffle (handled by the framework between map and reduce):
     *   values with the same key are grouped: <k, [v1, v2, v3, v4]>
     *   each key is routed by key.hashCode() % numReduceTasks
     *   keys are sorted in natural order
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // Processes all values for one key
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long nums = 0;
            for (LongWritable value : values) {
                nums += value.get(); // each value is 1
            }
            context.write(key, new LongWritable(nums));
        }
    }
}
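The key.hashCode() % numReduceTasks routing described above is what Hadoop's default partitioner, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner, implements. Here is a minimal sketch of that logic (the class name SketchPartitioner is made up for illustration; the job above simply uses the default and does not need to register it):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch of the default hash-partitioning logic
public class SketchPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numReduceTasks) {
        // Mask off the sign bit so the result is non-negative, then mod by the reducer count
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

Since word counts are just sums, this job could also call job.setCombinerClass(MyReducer.class); in main() to pre-aggregate the [word, 1] pairs on the map side and shrink the shuffle traffic.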
Dependencies (pom.xml):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.shixun</groupId>
    <artifactId>Hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Hadoop</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Hadoop dependencies -->
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- added -->
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- logging -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <!--<plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>-->
            </plugins>
        </pluginManagement>
    </build>
</project>
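Building with mvn clean package should produce target/Hadoop-1.0-SNAPSHOT.jar (Maven's default jar name for this artifactId and version). Since hadoop-client is marked provided, the jar is intended to run on a machine with Hadoop installed, e.g. hadoop jar target/Hadoop-1.0-SNAPSHOT.jar org.shixun.WordCount, after pointing the hard-coded paths in main() at HDFS; for quick tests, running main() directly from the IDE against the local paths also works.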
Running the job produces the output file part-r-00000.
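Given the sample input, its expected contents (word and count separated by a tab, keys in natural order) are:

c	2
hadoop	2
hello	3
java	6
linux	4
php	1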