目录
实验目的:给定一份英文文本,统计每个字符在文本中出现的频率
完成时间:2024-4-22
一、前提准备工作
启动hadoop集群
ssh localhost
cd /usr/local/hadoop
./sbin/start-dfs.sh
二、实验过程
1.虚拟机安装时先设置端口转发
2.上传对应文件
ls
3.编写Java应用程序
xhost local:gedit
export DISPLAY=:0
xhost local:gedit
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CharacterCount {
// Mapper 类,处理输入文件的每一行,并将字符逐个传递给 Reducer
public static class CharMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
// map 方法将输入的每一行文本拆分为字符,并将每个字符写入上下文
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 将输入行转换为小写以实现不区分大小写
String line = value.toString().toLowerCase();
for (int i = 0; i < line.length(); i++) {
char c = line.charAt(i);
// 检查字符是否为字母或数字,如果是,则将其写入上下文进行统计
if (Character.isLetter(c) || Character.isDigit(c)) {
context.write(new Text(String.valueOf(c)), one);
}
}
}
}
// Reducer 类,接收来自 Mapper 的字符统计数据并进行合并
public static class CharReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
// reduce 方法将相同字符的统计数据合并为总数,并写入输出上下文
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
// 主函数,设置作业的配置信息,并运行 MapReduce 任务
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "character count");
job.setJarByClass(CharacterCount.class);
job.setMapperClass(CharMapper.class);
job.setReducerClass(CharReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0])); // 设置输入路径
FileOutputFormat.setOutputPath(job, new Path(args[1])); // 设置输出路径
System.exit(job.waitForCompletion(true) ? 0 : 1); // 运行作业并等待完成
}
}
4. 编译打包程序
javac -classpath `/usr/local/hadoop/bin/hadoop classpath` CharacterCount.java
jar cf CharacterCount.jar *.class
5. 运行程序
cd /usr/local/hadoop
./bin/hdfs dfs -rm -r input
./bin/hdfs dfs -rm -r output
cd /usr/local/hadoop
./bin/hdfs dfs -mkdir input
cd /usr/local/hadoop
./bin/hdfs dfs -put /tmp/1.txt input
cd /usr/local/hadoop
./bin/hdfs dfs -rm -r /user/hadoop/output
cd ~
/usr/local/hadoop/bin/hadoop jar CharacterCount.jar CharacterCount input output
cd /usr/local/hadoop
./bin/hdfs dfs -cat output/*