MapReduce Data Filtering Program

June 03, 2015 · Hadoop

Original article. When reposting, please credit the source: 工学1号馆.

We have a batch of router logs from which we need to extract the MAC address and timestamp, discarding everything else.

The log lines look like this:
Apr 15 10:04:42 hostapd: wlan0: STA 14:7D:C5:9E:84
Apr 15 10:04:43 hostapd: wlan0: STA 14:7D:C5:9E:85
Apr 15 10:04:44 hostapd: wlan0: STA 14:7D:C5:9E:86
Apr 15 10:04:45 hostapd: wlan0: STA 14:7D:C5:9E:87
Apr 15 10:04:46 hostapd: wlan0: STA 14:7D:C5:9E:88
Apr 15 10:04:47 hostapd: wlan0: STA 14:7D:C5:9E:89
Apr 15 10:04:48 hostapd: wlan0: STA 14:7D:C5:9E:14
Apr 15 10:04:49 hostapd: wlan0: STA 14:7D:C5:9E:24
Apr 15 10:04:52 hostapd: wlan0: STA 14:7D:C5:9E:34
Apr 15 10:04:32 hostapd: wlan0: STA 14:7D:C5:9E:44
Apr 15 10:04:22 hostapd: wlan0: STA 14:7D:C5:9E:54

After filtering, the output should look like this:
Apr 15 10:04:42 14:7D:C5:9E:84
Apr 15 10:04:43 14:7D:C5:9E:85
Apr 15 10:04:44 14:7D:C5:9E:86
Apr 15 10:04:45 14:7D:C5:9E:87
Apr 15 10:04:46 14:7D:C5:9E:88
Apr 15 10:04:47 14:7D:C5:9E:89
Apr 15 10:04:48 14:7D:C5:9E:14
Apr 15 10:04:49 14:7D:C5:9E:24
Apr 15 10:04:52 14:7D:C5:9E:34
Apr 15 10:04:32 14:7D:C5:9E:44
Apr 15 10:04:22 14:7D:C5:9E:54

Approach

源文件—>Mapper(分割原始数据、输出所需数据、处理异常数据)—>输出到HDFS

The code is as follows:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class printmac extends Configured implements Tool{
	enum Counter {
		LINESKIP, // counts lines that failed to parse
	}
/**
 * Mapper&lt;LongWritable, Text, NullWritable, Text&gt;
 * LongWritable, Text are the input key and value types: the byte offset of each
 * line in the log file is the key, and the whole line is the value.
 * NullWritable, Text are the output key and value types; NullWritable is used
 * because only the reformatted line itself needs to be written out.
 */
public static class RouterMapper extends Mapper<LongWritable,Text,NullWritable,Text>{

	// map(LongWritable key, Text value, Context context)
	// key and value match the input key/value types declared on RouterMapper
	// Context gives access to the task's output and counters
	public void map(LongWritable key,Text value,Context context)throws IOException,InterruptedException{
		String line = value.toString();
		try{
		
			String[] lineSplit = line.split(" "); // split the raw line (fields separated by single spaces)
			String month = lineSplit[0];          // month
			String day = lineSplit[1];            // day
			String time = lineSplit[2];           // time
			String mac = lineSplit[6];            // MAC address

			// build the output value; it must match the output value type declared on RouterMapper
			Text out = new Text(month + " " + day + " " + time + " " + mac);

			// emit (no key, just the reformatted line)
			context.write(NullWritable.get(), out);
		
		} catch (ArrayIndexOutOfBoundsException e) {
			// malformed line: increment the LINESKIP counter and skip it
			context.getCounter(Counter.LINESKIP).increment(1);
			return;
		}
	}
}

@Override
public int run(String[] args) throws Exception {
	Configuration conf = getConf();
	Job job = new Job(conf, "printmac"); // set the job name
	job.setJarByClass(printmac.class);
	FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
	FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path

	job.setMapperClass(RouterMapper.class); // RouterMapper does all the work
	job.setNumReduceTasks(0); // map-only job, matching the pipeline above: mapper output goes straight to HDFS
	job.setOutputFormatClass(TextOutputFormat.class);

	job.setOutputKeyClass(NullWritable.class); // must match RouterMapper's output key type
	job.setOutputValueClass(Text.class);       // must match RouterMapper's output value type
	job.waitForCompletion(true);
	return job.isSuccessful() ? 0 : 1;
}

	// main method for launching the job; expects an input path and an output path
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new printmac(), args);
		System.exit(res);
	}
}
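
To run the job, package the class into a jar and submit it with the hadoop command line; the jar name and HDFS paths below are placeholders, not from the original post:

hadoop jar printmac.jar printmac /logs/input /logs/output

Because waitForCompletion(true) runs in verbose mode, the job prints its counters when it finishes, including the LINESKIP counter with the number of malformed lines that were skipped.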