Big Data Learning Day 8

Posted 2019-08-01 · Updated 2019-08-06 · Category: Big Data

In this post we clean the Zhaopin (智联招聘) job-listing data crawled earlier with a MapReduce job, then analyze the results.
Upload the job data to the distributed file system

hdfs dfs -put /opt/zl0507.csv /
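To confirm the upload, a standard HDFS listing and a quick peek at the file work (the exact output depends on your cluster):

hdfs dfs -ls /
hdfs dfs -cat /zl0507.csv | head -n 3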

In Eclipse, create a new package named com.sj.clean.

Inside it, create the class CleanMapper.java:

package com.sj.clean;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Skip the header row; its byte offset (the key) is 0
        if (!key.toString().equals("0")) {
            String[] arr = value.toString().split(",");
            // Each row is reassembled into a 7-field array
            String[] cacheString = new String[7];
            // The 7th field is free text and may itself contain commas (the
            // delimiter), which would break the split; everything from index 6
            // onward is therefore concatenated back together via a StringBuffer
            StringBuffer stringBuffer = new StringBuffer();
            for (int i = 0; i < arr.length; i++) {
                if (i < 6) {
                    cacheString[i] = arr[i];
                } else {
                    stringBuffer.append(arr[i]);
                }
            }
            cacheString[6] = stringBuffer.toString();
            if (check(cacheString)) {
                String data = String.join("|", cacheString);
                context.write(NullWritable.get(), new Text(data));
            }
        }
    }

    // Reject any record with a missing or empty field
    private boolean check(String[] arr) {
        for (String string : arr) {
            if (string == null || string.trim().equals("")) {
                return false;
            }
        }
        return true;
    }
}
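To make the comma handling concrete, here is a hypothetical row (the real column layout of zl0507.csv is not shown in this post, so the fields below are invented for illustration). Note that append(arr[i]) adds no separator, so a comma inside the 7th field is dropped rather than preserved:

input:  1001,Java Engineer,Beijing,15K-25K,Bachelor,3-5 yrs,responsible for backend development, some Java required
output: 1001|Java Engineer|Beijing|15K-25K|Bachelor|3-5 yrs|responsible for backend development some Java required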

Create the class CleanReducer.java:

package com.sj.clean;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class CleanReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Keep only records that still have exactly 7 fields after cleaning
        for (Text text : values) {
            String[] datalist = text.toString().split("\\|");
            if (datalist.length == 7) {
                String data = String.join("|", datalist);
                context.write(NullWritable.get(), new Text(data));
            }
        }
    }
}

Create the class Main.java:

package com.sj.clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(Main.class);
        job.setMapperClass(CleanMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(CleanReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        // The reducer emits Text values, so the output value class must be Text
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.245.132:9000/zl0507.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.245.132:9000/clean"));
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : -1);
    }
}

Run Main.java; the cleaned data is written to /clean/ on the distributed file system.
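To run the job outside Eclipse, the usual flow is to export the project as a runnable jar and submit it with hadoop jar, then inspect the result. The jar name below is made up for illustration; part-r-00000 is the standard output file name for a single-reducer job:

hadoop jar clean.jar com.sj.clean.Main
hdfs dfs -ls /clean
hdfs dfs -cat /clean/part-r-00000 | head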

Use Hive for word-frequency statistics:

Common commands:
  • create database if not exists hive;
  • show databases;
  • show databases like 'h.*';
  • use hive;
  • show tables;
  • create table if not exists hive.userr(name string comment 'username', pwd string comment 'password', address struct<street:string,city:string,state:string,zip:int>, identify map<int,string> comment 'number,sex'); (queried as sketched below)
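As a sketch of how the nested types in that last table could be queried (assuming hive.userr has been created and populated as above, with identify keyed by int per the reconstruction; struct fields use dot notation, map values are indexed by key):

select name, address.city, identify[1] from hive.userr;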
Word count:

On the master node:

cd /opt
vi wc.txt

Write the following content:

hongyutang love qiaoshuang
xiaoyu is running
china is great country
beijing is the capital of china

Start Hive and run:

create table wc(txt String) row format delimited fields terminated by '\t';
load data local inpath '/opt/wc.txt' overwrite into table wc;
select * from wc;
select split(txt,' ') from wc;

Result:

["hongyutang","love","qiaoshuang"]
["xiaoyu","is","running"]
["china","is","great","country"]
["beijing","is","the","capital","of","china"]

select explode(split(txt,' ')) from wc;

Result:

hongyutang
love
qiaoshuang
xiaoyu
is
running
china
is
great
country
beijing
is
the
capital
of
china

select t1.word,count(t1.word) from (select explode(split(txt ,' ')) word from wc)t1 group by t1.word;

Result:

beijing 1
capital 1
china 2
country 1
great 1
hongyutang 1
is 3
love 1
of 1
qiaoshuang 1
running 1
the 1
xiaoyu 1
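A small variant of the same aggregation, if you want the most frequent words first (ordering by the count alias; standard HiveQL):

select t1.word, count(t1.word) as cnt from (select explode(split(txt, ' ')) word from wc) t1 group by t1.word order by cnt desc;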