WordCount is the classic introductory example for Hadoop, and it captures the core idea of MapReduce.
The input file is hadoop.txt, with the following content:
hello,java
hello,java,linux,hadoop
hadoop,java,linux
hello,java,linux
linux,c,java
c,php,java
The goal is to count how many times each word appears in the whole file.
The Hadoop way of thinking:
MapReduce -----》
Map:
    Mapper class -----》 processes one input split
    map() -----》 processes one line of that split
        {splits the line into words and emits [word, 1] for each}
Shuffle: pairs with the same key are grouped into <k, [v1, v2, v3, v4]>, keys are sorted in natural order, and each key is routed to a reducer by key.hashCode() % numReduceTasks
Reduce:
    Reducer class -----》 receives the grouped data handed from map to reduce
    reduce() -----》 processes all values for one key
        {iterates over the values, accumulates them, and writes out [word, count]}
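For the sample input above, the data at each stage looks like this (assuming a single reducer):

map output:     [hello,1] [java,1] [hello,1] [java,1] [linux,1] [hadoop,1] ... one pair per word occurrence
after shuffle:  <c,[1,1]> <hadoop,[1,1]> <hello,[1,1,1]> <java,[1,1,1,1,1,1]> <linux,[1,1,1,1]> <php,[1]>
reduce output:  [c,2] [hadoop,2] [hello,3] [java,6] [linux,4] [php,1]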
Directory structure:
Code: MapReduce is classic "fill-in-the-blanks" programming; the framework supplies the skeleton, and you fill in map() and reduce().
package org.shixun;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCount {
    /**
     * Driver: configures the job and submits it.
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Load the current Hadoop configuration
        Configuration configuration = new Configuration();
        // 2. Create the job to submit (pass in the configuration; otherwise it is never used)
        Job job = Job.getInstance(configuration);
        // 3. Set the main class
        job.setJarByClass(WordCount.class);
        // 4. Configure the Mapper and its output types
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // 5. Configure the Reducer and its output types
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // 6. Input path; the default input type is <LongWritable, Text>, i.e. <byte offset, line contents>
        FileInputFormat.addInputPath(job, new Path("D:\\code\\shixun\\mr\\input\\hadoop.txt"));
        // 7. Output path (must not already exist, or the job fails)
        FileOutputFormat.setOutputPath(job, new Path("D:\\code\\shixun\\mr\\output\\out2"));
        // 8. Run the job and wait for completion
        System.out.println(job.waitForCompletion(true) ? "success" : "failed");
    }

    /**
     * Mapper: the core map-side class; each instance processes one input split.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private final LongWritable longWritable = new LongWritable(1);

        // Processes one line of the split
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Get one line of data
            String line = value.toString();
            // Split the line into words and emit [word, 1] for each
            String[] splits = line.split(",");
            for (String split : splits) {
                context.write(new Text(split), longWritable); // e.g. [hello, 1]
            }
        }
    }

    /**
     * Shuffle (handled by the framework between map and reduce):
     *   values with the same key are grouped: <k, [v1, v2, v3, v4]>
     *   each key is routed by key.hashCode() % numReduceTasks
     *   keys are sorted in natural order
     */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // Processes all values for one key
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long nums = 0;
            for (LongWritable value : values) {
                nums += value.get(); // each value is 1
            }
            context.write(key, new LongWritable(nums));
        }
    }
}
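The key.hashCode() % numReduceTasks routing described above is what Hadoop's default partitioner, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner, implements. Here is a minimal sketch of that logic (the class name SketchPartitioner is made up for illustration; the job above simply uses the default and does not need to register it):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch of the default hash-partitioning logic
public class SketchPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numReduceTasks) {
        // Mask off the sign bit so the result is non-negative, then mod by the reducer count
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}

Since word counts are just sums, this job could also call job.setCombinerClass(MyReducer.class); in main() to pre-aggregate the [word, 1] pairs on the map side and shrink the shuffle traffic.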
Dependencies (pom.xml):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.shixun</groupId>
    <artifactId>Hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>Hadoop</name>
    <!-- FIXME change it to the project's website -->
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Hadoop dependencies -->
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.3</version>
            <scope>provided</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- added -->
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>2.7.3</version>
        </dependency>
        <!-- logging -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <!--<plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>-->
            </plugins>
        </pluginManagement>
    </build>
</project>
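Building with mvn clean package should produce target/Hadoop-1.0-SNAPSHOT.jar (Maven's default jar name for this artifactId and version). Since hadoop-client is marked provided, the jar is intended to run on a machine with Hadoop installed, e.g. hadoop jar target/Hadoop-1.0-SNAPSHOT.jar org.shixun.WordCount, after pointing the hard-coded paths in main() at HDFS; for quick tests, running main() directly from the IDE against the local paths also works.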
Running the job produces the output file part-r-00000.
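Given the sample input, its expected contents (word and count separated by a tab, keys in natural order) are:

c	2
hadoop	2
hello	3
java	6
linux	4
php	1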