目录
实验目的:给定一份英文文本,统计每个字符在文本中出现的频率(本文的实现只统计字母和数字,且不区分大小写)
完成时间:2024-4-22
一、前提准备工作
启动hadoop集群
ssh localhost
cd /usr/local/hadoop
./sbin/start-dfs.sh

二、实验过程

1.虚拟机安装先设置端口转发

2.上传对应文件

ls
3.编写Java应用程序

xhost local:gedit
export DISPLAY=:0
xhost local:gedit
import java.io.IOException;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CharacterCount {
  // Mapper 类,处理输入文件的每一行,并将字符逐个传递给 Reducer
  public static class CharMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    // map 方法将输入的每一行文本拆分为字符,并将每个字符写入上下文
    public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
      // 将输入行转换为小写以实现不区分大小写
      String line = value.toString().toLowerCase();
      for (int i = 0; i < line.length(); i++) {
        char c = line.charAt(i);
        // 检查字符是否为字母或数字,如果是,则将其写入上下文进行统计
        if (Character.isLetter(c) || Character.isDigit(c)) {
          context.write(new Text(String.valueOf(c)), one);
        }
      }
    }
  }
  // Reducer 类,接收来自 Mapper 的字符统计数据并进行合并
  public static class CharReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();
    // reduce 方法将相同字符的统计数据合并为总数,并写入输出上下文
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }
  // 主函数,设置作业的配置信息,并运行 MapReduce 任务
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "character count");
    job.setJarByClass(CharacterCount.class);
    job.setMapperClass(CharMapper.class);
    job.setReducerClass(CharReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0])); // 设置输入路径
    FileOutputFormat.setOutputPath(job, new Path(args[1])); // 设置输出路径
    System.exit(job.waitForCompletion(true) ? 0 : 1); // 运行作业并等待完成
  }
}

4. 编译打包程序
javac -classpath `/usr/local/hadoop/bin/hadoop classpath` CharacterCount.java
jar cf CharacterCount.jar *.class
5. 运行程序
cd /usr/local/hadoop
./bin/hdfs dfs -rm -r input
./bin/hdfs dfs -rm -r output
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir input
cd /usr/local/hadoop
./bin/hdfs dfs -put /tmp/1.txt input
cd /usr/local/hadoop
./bin/hdfs dfs -rm -r /user/hadoop/output
cd ~
/usr/local/hadoop/bin/hadoop jar CharacterCount.jar CharacterCount input output

cd /usr/local/hadoop
./bin/hdfs dfs -cat output/*










