Questions to guide your reading:
1. Which two classes does Hadoop MapReduce provide for exchanging data with a relational database?
2. If the MySQL driver jar is missing, what error do you typically get?
3. How do you make the jar available?
Sometimes a project has a job whose input is huge but whose output is tiny, e.g. PV/UV statistics. To serve real-time queries or OLAP needs, we then want MapReduce to exchange data with MySQL directly, and this is exactly an area where HBase and Hive are currently weakest.
With that said, here is a quick look at the background, the mechanism, and the points to watch out for:
1. To let MapReduce access relational databases (MySQL, Oracle) conveniently, Hadoop provides the DBInputFormat and DBOutputFormat classes. DBInputFormat reads database table rows into the job (e.g. to land them on HDFS), and DBOutputFormat writes the result set that MapReduce produces back into a database table.
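In outline, the wiring looks like this (a condensed excerpt of the full example in section 5 below; the JDBC URL and credentials are placeholders):

JobConf conf = new JobConf(Mysql2Mr.class);
conf.setInputFormat(DBInputFormat.class);    // input splits come from a database table
conf.setOutputFormat(DBOutputFormat.class);  // reduce output goes back into a table
DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
        "jdbc:mysql://host:3306/test", "user", "password");
// read columns id, name from table t; write results into table t2
DBInputFormat.setInput(conf, StudentinfoRecord.class, "t", null, "id", new String[] { "id", "name" });
DBOutputFormat.setOutput(conf, "t2", "id", "name");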
2. Because the 0.20 release supports DBInputFormat and DBOutputFormat poorly, this example demonstrates the two classes with the 0.19-era API. At least in my 0.20.203 there is no db package under org.apache.hadoop.mapreduce.lib, so this article sticks with the old API as well.
3. If running the MapReduce job fails with java.io.IOException: com.mysql.jdbc.Driver, the program usually cannot find the MySQL driver jar. The fix is to make the driver jar visible to every tasktracker that runs the job's tasks.
There are two ways to add the jar (a third option is sketched after this list):
(1) Drop the jar into ${HADOOP_HOME}/lib on every node and restart the cluster. This works, but it is the rather primitive way.
(2) a) Upload the jar to the cluster: hadoop fs -put mysql-connector-java-5.1.0-bin.jar /hdfsPath/
b) Before the program submits the job, add: DistributedCache.addFileToClassPath(new Path("/hdfsPath/mysql-connector-java-5.1.0-bin.jar"), conf);
Note: although the code below uses the 0.19 API, it runs against the 0.20 API just as well; you merely get deprecation warnings.
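A third option, assuming your driver class implements the Tool interface (the first WARN line in the log in section 7 nudges you toward this), is to pass the jar with the -libjars generic option, which GenericOptionsParser then distributes to the task classpaths for you. The jar name mysql2mr.jar and the local path here are illustrative:

hadoop jar mysql2mr.jar Mysql2Mr -libjars /local/path/mysql-connector-java-5.1.0-bin.jar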
4. Test data:
CREATE TABLE `t` (
  `id` int DEFAULT NULL,
  `name` varchar(10) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

CREATE TABLE `t2` (
  `id` int DEFAULT NULL,
  `name` varchar(10) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

insert into t values (1,"june"),(2,"decli"),(3,"hello"),
(4,"june"),(5,"decli"),(6,"hello"),(7,"june"),
(8,"decli"),(9,"hello"),(10,"june"),
(11,"june"),(12,"decli"),(13,"hello");
5. Code:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Iterator;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.db.DBConfiguration;
import org.apache.hadoop.mapred.lib.db.DBInputFormat;
import org.apache.hadoop.mapred.lib.db.DBOutputFormat;
import org.apache.hadoop.mapred.lib.db.DBWritable;

/**
 * Function: tests data exchange between MapReduce and MySQL; this test case copies
 * the rows of one table into another. In practice you may only need to read from
 * MySQL, or only write into it.
 * date: 2013-7-29 2:34:04 AM <br/>
 * @author june
 */
public class Mysql2Mr {
    // DROP TABLE IF EXISTS `hadoop`.`studentinfo`;
    // CREATE TABLE studentinfo (
    //   id INTEGER NOT NULL PRIMARY KEY,
    //   name VARCHAR(32) NOT NULL);

    public static class StudentinfoRecord implements Writable, DBWritable {
        int id;
        String name;

        public StudentinfoRecord() {
        }

        // Writable: deserialize from the intermediate (shuffle) representation
        public void readFields(DataInput in) throws IOException {
            this.id = in.readInt();
            this.name = Text.readString(in);
        }

        public String toString() {
            return new String(this.id + " " + this.name);
        }

        // DBWritable: bind this record's fields to the INSERT statement's parameters
        @Override
        public void write(PreparedStatement stmt) throws SQLException {
            stmt.setInt(1, this.id);
            stmt.setString(2, this.name);
        }

        // DBWritable: populate this record from one row of the JDBC result set
        @Override
        public void readFields(ResultSet result) throws SQLException {
            this.id = result.getInt(1);
            this.name = result.getString(2);
        }

        // Writable: serialize to the intermediate (shuffle) representation
        @Override
        public void write(DataOutput out) throws IOException {
            out.writeInt(this.id);
            Text.writeString(out, this.name);
        }
    }

    // Remember: this must be a *static* inner class; otherwise implement a no-arg
    // constructor yourself, or wait for the exception:
    // Caused by: java.lang.NoSuchMethodException: DBInputMapper.<init>()
    // http://stackoverflow.com/questions/7154125/custom-mapreduce-input-format-cant-find-constructor
    // (the versions mindlessly re-posted around the web all get this wrong...)
    public static class DBInputMapper extends MapReduceBase implements
            Mapper<LongWritable, StudentinfoRecord, LongWritable, Text> {
        public void map(LongWritable key, StudentinfoRecord value,
                OutputCollector<LongWritable, Text> collector, Reporter reporter) throws IOException {
            collector.collect(new LongWritable(value.id), new Text(value.toString()));
        }
    }

    public static class MyReducer extends MapReduceBase implements
            Reducer<LongWritable, Text, StudentinfoRecord, Text> {
        @Override
        public void reduce(LongWritable key, Iterator<Text> values,
                OutputCollector<StudentinfoRecord, Text> output, Reporter reporter) throws IOException {
            // each id occurs exactly once in the input, so one value per key is enough
            String[] splits = values.next().toString().split(" ");
            StudentinfoRecord r = new StudentinfoRecord();
            r.id = Integer.parseInt(splits[0]);
            r.name = splits[1];
            output.collect(r, new Text(r.name));
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(Mysql2Mr.class);
        // ship the MySQL driver jar (previously uploaded to HDFS) to every task's classpath
        DistributedCache.addFileToClassPath(new Path("/tmp/mysql-connector-java-5.0.8-bin.jar"), conf);

        conf.setMapOutputKeyClass(LongWritable.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setOutputFormat(DBOutputFormat.class);
        conf.setInputFormat(DBInputFormat.class);
        // // mysql to hdfs
        // conf.setReducerClass(IdentityReducer.class);
        // Path outPath = new Path("/tmp/1");
        // FileSystem.get(conf).delete(outPath, true);
        // FileOutputFormat.setOutputPath(conf, outPath);

        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", "jdbc:mysql://192.168.1.101:3306/test",
                "root", "root");
        String[] fields = { "id", "name" };
        // read from table t
        DBInputFormat.setInput(conf, StudentinfoRecord.class, "t", null, "id", fields);
        // write the MapReduce output into table t2
        DBOutputFormat.setOutput(conf, "t2", "id", "name");
        // conf.setMapperClass(org.apache.hadoop.mapred.lib.IdentityMapper.class);
        conf.setMapperClass(DBInputMapper.class);
        conf.setReducerClass(MyReducer.class);

        JobClient.runJob(conf);
    }
}
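As an aside, the commented-out "mysql to hdfs" block in main() is all it takes to dump the table to HDFS instead of into t2: keep DBInputFormat on the input side but switch the output side to plain files. Expanded, that sketch looks like this (the output path is illustrative; FileSystem and FileOutputFormat come from org.apache.hadoop.fs and org.apache.hadoop.mapred):

conf.setInputFormat(DBInputFormat.class);
conf.setReducerClass(IdentityReducer.class);   // pass the mapper's (id, "id name") pairs through
Path outPath = new Path("/tmp/1");
FileSystem.get(conf).delete(outPath, true);    // clear results of any previous run
FileOutputFormat.setOutputPath(conf, outPath); // replaces conf.setOutputFormat(DBOutputFormat.class)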
6. Results:
After running the job twice you can see the result in MySQL (DBOutputFormat issues plain INSERTs, so each run appends another 13 rows):
mysql> select * from t2;
+------+-------+
| id   | name  |
+------+-------+
|    1 | june  |
|    2 | decli |
|    3 | hello |
|    4 | june  |
|    5 | decli |
|    6 | hello |
|    7 | june  |
|    8 | decli |
|    9 | hello |
|   10 | june  |
|   11 | june  |
|   12 | decli |
|   13 | hello |
|    1 | june  |
|    2 | decli |
|    3 | hello |
|    4 | june  |
|    5 | decli |
|    6 | hello |
|    7 | june  |
|    8 | decli |
|    9 | hello |
|   10 | june  |
|   11 | june  |
|   12 | decli |
|   13 | hello |
+------+-------+
26 rows in set (0.00 sec)

mysql>
7. Log output:
13/07/29 02:33:03 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Creating mysql-connector-java-5.0.8-bin.jar in /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp-work--8372797484204470322 with rwxr-xr-x
13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Cached hdfs://192.168.1.101:9000/tmp/mysql-connector-java-5.0.8-bin.jar as /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp/mysql-connector-java-5.0.8-bin.jar
13/07/29 02:33:03 INFO filecache.TrackerDistributedCacheManager: Cached hdfs://192.168.1.101:9000/tmp/mysql-connector-java-5.0.8-bin.jar as /tmp/hadoop-june/mapred/local/archive/-8943686319031389138_-1232673160_640840668/192.168.1.101/tmp/mysql-connector-java-5.0.8-bin.jar
13/07/29 02:33:03 INFO mapred.JobClient: Running job: job_local_0001
13/07/29 02:33:03 INFO mapred.MapTask: numReduceTasks: 1
13/07/29 02:33:03 INFO mapred.MapTask: io.sort.mb = 100
13/07/29 02:33:03 INFO mapred.MapTask: data buffer = 79691776/99614720
13/07/29 02:33:03 INFO mapred.MapTask: record buffer = 262144/327680
13/07/29 02:33:03 INFO mapred.MapTask: Starting flush of map output
13/07/29 02:33:03 INFO mapred.MapTask: Finished spill 0
13/07/29 02:33:03 INFO mapred.Task: Task:attempt_local_0001_m_000000_0 is done. And is in the process of commiting
13/07/29 02:33:04 INFO mapred.JobClient:  map 0% reduce 0%
13/07/29 02:33:06 INFO mapred.LocalJobRunner:
13/07/29 02:33:06 INFO mapred.Task: Task 'attempt_local_0001_m_000000_0' done.
13/07/29 02:33:06 INFO mapred.LocalJobRunner:
13/07/29 02:33:06 INFO mapred.Merger: Merging 1 sorted segments
13/07/29 02:33:06 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 235 bytes
13/07/29 02:33:06 INFO mapred.LocalJobRunner:
13/07/29 02:33:06 INFO mapred.Task: Task:attempt_local_0001_r_000000_0 is done. And is in the process of commiting
13/07/29 02:33:07 INFO mapred.JobClient:  map 100% reduce 0%
13/07/29 02:33:09 INFO mapred.LocalJobRunner: reduce > reduce
13/07/29 02:33:09 INFO mapred.Task: Task 'attempt_local_0001_r_000000_0' done.
13/07/29 02:33:09 WARN mapred.FileOutputCommitter: Output path is null in cleanup
13/07/29 02:33:10 INFO mapred.JobClient:  map 100% reduce 100%
13/07/29 02:33:10 INFO mapred.JobClient: Job complete: job_local_0001
13/07/29 02:33:10 INFO mapred.JobClient: Counters: 18
13/07/29 02:33:10 INFO mapred.JobClient:   File Input Format Counters
13/07/29 02:33:10 INFO mapred.JobClient:     Bytes Read=0
13/07/29 02:33:10 INFO mapred.JobClient:   File Output Format Counters
13/07/29 02:33:10 INFO mapred.JobClient:     Bytes Written=0
13/07/29 02:33:10 INFO mapred.JobClient:   FileSystemCounters
13/07/29 02:33:10 INFO mapred.JobClient:     FILE_BYTES_READ=1211691
13/07/29 02:33:10 INFO mapred.JobClient:     HDFS_BYTES_READ=1081704
13/07/29 02:33:10 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=2392844
13/07/29 02:33:10 INFO mapred.JobClient:   Map-Reduce Framework
13/07/29 02:33:10 INFO mapred.JobClient:     Map output materialized bytes=239
13/07/29 02:33:10 INFO mapred.JobClient:     Map input records=13
13/07/29 02:33:10 INFO mapred.JobClient:     Reduce shuffle bytes=0
13/07/29 02:33:10 INFO mapred.JobClient:     Spilled Records=26
13/07/29 02:33:10 INFO mapred.JobClient:     Map output bytes=207
13/07/29 02:33:10 INFO mapred.JobClient:     Map input bytes=13
13/07/29 02:33:10 INFO mapred.JobClient:     SPLIT_RAW_BYTES=75
13/07/29 02:33:10 INFO mapred.JobClient:     Combine input records=0
13/07/29 02:33:10 INFO mapred.JobClient:     Reduce input records=13
13/07/29 02:33:10 INFO mapred.JobClient:     Reduce input groups=13
13/07/29 02:33:10 INFO mapred.JobClient:     Combine output records=0
13/07/29 02:33:10 INFO mapred.JobClient:     Reduce output records=13
13/07/29 02:33:10 INFO mapred.JobClient:     Map output records=13
Connecting MapReduce directly to MySQL to fetch data (this second example uses the new mapreduce API)
Data in MySQL:
mysql> select * from lxw_tbls;
+---------------------+----------------+
| TBL_NAME            | TBL_TYPE       |
+---------------------+----------------+
| lxw_test_table      | EXTERNAL_TABLE |
| lxw_t               | MANAGED_TABLE  |
| lxw_t1              | MANAGED_TABLE  |
| tt                  | MANAGED_TABLE  |
| tab_partition       | MANAGED_TABLE  |
| lxw_hbase_table_1   | MANAGED_TABLE  |
| lxw_hbase_user_info | MANAGED_TABLE  |
| t                   | EXTERNAL_TABLE |
| lxw_jobid           | MANAGED_TABLE  |
+---------------------+----------------+
9 rows in set (0.01 sec)

mysql> select * from lxw_tbls where TBL_NAME like 'lxw%' order by TBL_NAME;
+---------------------+----------------+
| TBL_NAME            | TBL_TYPE       |
+---------------------+----------------+
| lxw_hbase_table_1   | MANAGED_TABLE  |
| lxw_hbase_user_info | MANAGED_TABLE  |
| lxw_jobid           | MANAGED_TABLE  |
| lxw_t               | MANAGED_TABLE  |
| lxw_t1              | MANAGED_TABLE  |
| lxw_test_table      | EXTERNAL_TABLE |
+---------------------+----------------+
6 rows in set (0.00 sec)
The MapReduce program, ConnMysql.java:
package com.lxw.study;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ConnMysql {

    private static Configuration conf = new Configuration();

    static {
        conf.addResource(new Path("F:/lxw-hadoop/hdfs-site.xml"));
        conf.addResource(new Path("F:/lxw-hadoop/mapred-site.xml"));
        conf.addResource(new Path("F:/lxw-hadoop/core-site.xml"));
        conf.set("mapred.job.tracker", "10.133.103.21:50021");
    }

    public static class TblsRecord implements Writable, DBWritable {
        String tbl_name;
        String tbl_type;

        public TblsRecord() {
        }

        // DBWritable: bind this record's fields to the INSERT statement's parameters
        @Override
        public void write(PreparedStatement statement) throws SQLException {
            statement.setString(1, this.tbl_name);
            statement.setString(2, this.tbl_type);
        }

        // DBWritable: populate this record from one row of the JDBC result set
        @Override
        public void readFields(ResultSet resultSet) throws SQLException {
            this.tbl_name = resultSet.getString(1);
            this.tbl_type = resultSet.getString(2);
        }

        // Writable: serialize to the intermediate (shuffle) representation
        @Override
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, this.tbl_name);
            Text.writeString(out, this.tbl_type);
        }

        // Writable: deserialize from the intermediate (shuffle) representation
        @Override
        public void readFields(DataInput in) throws IOException {
            this.tbl_name = Text.readString(in);
            this.tbl_type = Text.readString(in);
        }

        public String toString() {
            return new String(this.tbl_name + " " + this.tbl_type);
        }

    }

    public static class ConnMysqlMapper extends Mapper<LongWritable, TblsRecord, Text, Text> {
        public void map(LongWritable key, TblsRecord values, Context context)
                throws IOException, InterruptedException {
            context.write(new Text(values.tbl_name), new Text(values.tbl_type));
        }
    }

    public static class ConnMysqlReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Iterator<Text> itr = values.iterator(); itr.hasNext();) {
                context.write(key, itr.next());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Path output = new Path("/user/lxw/output/");

        FileSystem fs = FileSystem.get(URI.create(output.toString()), conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        // MySQL JDBC driver
        DistributedCache.addFileToClassPath(new Path(
                "hdfs://hd022-test.nh.sdo.com/user/liuxiaowen/mysql-connector-java-5.1.13-bin.jar"), conf);

        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
                "jdbc:mysql://10.133.103.22:3306/hive", "hive", "hive");

        Job job = new Job(conf, "test mysql connection");
        job.setJarByClass(ConnMysql.class);

        job.setMapperClass(ConnMysqlMapper.class);
        job.setReducerClass(ConnMysqlReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(DBInputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        // column names
        String[] fields = { "TBL_NAME", "TBL_TYPE" };
        // the six arguments are:
        // 1. the Job; 2. the Class<? extends DBWritable>;
        // 3. the table name; 4. the WHERE conditions;
        // 5. the ORDER BY clause; 6. the column names
        DBInputFormat.setInput(job, TblsRecord.class,
                "lxw_tbls", "TBL_NAME like 'lxw%'", "TBL_NAME", fields);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
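For reference, given those six arguments DBInputFormat builds its SELECT along these lines (the exact statement, including the LIMIT/OFFSET it appends to carve the table into splits, is an implementation detail that varies by Hadoop version):

SELECT TBL_NAME, TBL_TYPE FROM lxw_tbls WHERE (TBL_NAME like 'lxw%') ORDER BY TBL_NAME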
Run result:
[lxw@hd025-test ~]$ hadoop fs -cat /user/lxw/output/part-r-00000
lxw_hbase_table_1 MANAGED_TABLE
lxw_hbase_user_info MANAGED_TABLE
lxw_jobid MANAGED_TABLE
lxw_t MANAGED_TABLE
lxw_t1 MANAGED_TABLE
lxw_test_table EXTERNAL_TABLE