Big Data Learning Day 8

Posted 2019-08-01 · Updated 2019-08-06 · Category: Big Data

In this post we clean the Zhaopin (智联招聘) job-listing data crawled earlier with a MapReduce job, then analyze the results.
Upload the job data to the distributed file system

hdfs dfs -put /opt/zl0507.csv /
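To confirm the upload, a standard HDFS listing and a quick peek at the file work (the exact output depends on your cluster):

hdfs dfs -ls /
hdfs dfs -cat /zl0507.csv | head -n 3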

In Eclipse, create a new package named com.sj.clean.

Inside it, create the class CleanMapper.java:

package com.sj.clean;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Skip the header row; its byte offset (the key) is 0
        if (!key.toString().equals("0")) {
            String[] arr = value.toString().split(",");
            // Each row is reassembled into a 7-field array
            String[] cacheString = new String[7];
            // The 7th field is free text and may itself contain commas (the
            // delimiter), which would break the split; everything from index 6
            // onward is therefore concatenated back together via a StringBuffer
            StringBuffer stringBuffer = new StringBuffer();
            for (int i = 0; i < arr.length; i++) {
                if (i < 6) {
                    cacheString[i] = arr[i];
                } else {
                    stringBuffer.append(arr[i]);
                }
            }
            cacheString[6] = stringBuffer.toString();
            if (check(cacheString)) {
                String data = String.join("|", cacheString);
                context.write(NullWritable.get(), new Text(data));
            }
        }
    }

    // Reject any record with a missing or empty field
    private boolean check(String[] arr) {
        for (String string : arr) {
            if (string == null || string.trim().equals("")) {
                return false;
            }
        }
        return true;
    }
}
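To make the comma handling concrete, here is a hypothetical row (the real column layout of zl0507.csv is not shown in this post, so the fields below are invented for illustration). Note that append(arr[i]) adds no separator, so a comma inside the 7th field is dropped rather than preserved:

input:  1001,Java Engineer,Beijing,15K-25K,Bachelor,3-5 yrs,responsible for backend development, some Java required
output: 1001|Java Engineer|Beijing|15K-25K|Bachelor|3-5 yrs|responsible for backend development some Java required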

Create the class CleanReducer.java:

package com.sj.clean;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CleanReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Keep only records that still have exactly 7 fields after cleaning
        for (Text text : values) {
            String[] datalist = text.toString().split("\\|");
            if (datalist.length == 7) {
                String data = String.join("|", datalist);
                context.write(NullWritable.get(), new Text(data));
            }
        }
    }
}

Create the class Main.java:

package com.sj.clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(Main.class);
        job.setMapperClass(CleanMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(CleanReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        // The reducer emits Text values, so the output value class must be Text
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.245.132:9000/zl0507.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.245.132:9000/clean"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : -1);
    }
}

Run Main.java; the cleaned data is written to /clean/ on the distributed file system.
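To run the job outside Eclipse, the usual flow is to export the project as a runnable jar and submit it with hadoop jar, then inspect the result. The jar name below is made up for illustration; part-r-00000 is the standard output file name for a single-reducer job:

hadoop jar clean.jar com.sj.clean.Main
hdfs dfs -ls /clean
hdfs dfs -cat /clean/part-r-00000 | head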

Use Hive for word-frequency statistics:

Common commands:
  • create database if not exists hive;
  • show databases;
  • show databases like 'h.*';
  • use hive;
  • show tables;
  • create table if not exists hive.userr(name string comment 'username', pwd string comment 'password', address struct<street:string,city:string,state:string,zip:int>, identify map<int,string> comment 'number,sex'); (queried as sketched below)
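As a sketch of how the nested types in that last table could be queried (assuming hive.userr has been created and populated as above, with identify keyed by int per the reconstruction; struct fields use dot notation, map values are indexed by key):

select name, address.city, identify[1] from hive.userr;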
Word count:

On the master node:

cd /opt
vi wc.txt

Write the following content:

hongyutang love qiaoshuang
xiaoyu is running
china is great country
beijing is the capital of china

Start Hive and run:

create table wc(txt String) row format delimited fields terminated by '\t';
load data local inpath '/opt/wc.txt' overwrite into table wc;
select * from wc;
select split(txt,' ') from wc;

Result:

["hongyutang","love","qiaoshuang"]
["xiaoyu","is","running"]
["china","is","great","country"]
["beijing","is","the","capital","of","china"]

select explode(split(txt,' ')) from wc;

Result:

hongyutang
love
qiaoshuang
xiaoyu
is
running
china
is
great
country
beijing
is
the
capital
of
china

select t1.word,count(t1.word) from (select explode(split(txt ,' ')) word from wc)t1 group by t1.word;

Result:

beijing 1
capital 1
china 2
country 1
great 1
hongyutang 1
is 3
love 1
of 1
qiaoshuang 1
running 1
the 1
xiaoyu 1
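A small variant of the same aggregation, if you want the most frequent words first (ordering by the count alias; standard HiveQL):

select t1.word, count(t1.word) as cnt from (select explode(split(txt, ' ')) word from wc) t1 group by t1.word order by cnt desc;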