OS: Linux Ubuntu Server 9.10
Prerequisites: Subversion must be installed and a Hadoop cluster must already be set up.
1. First check out the Hive source with svn, then build the package with ant:
cd /opt
svn co http://svn.apache.org/repos/asf/hadoop/hive/trunk hive
cd hive
ant package
2. In /opt/hadoop/conf/hadoop-env.sh, add
export HIVE_HOME=/opt/hive
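If you prefer to do this from the shell, a one-line sketch that appends the same setting:
echo 'export HIVE_HOME=/opt/hive' >> /opt/hadoop/conf/hadoop-env.sh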
3. Start Hadoop
cd /opt/hadoop
On the local machine, run: bin/hadoop namenode -format
bin/start-dfs.sh
bin/start-mapred.sh
bin/hadoop fs -mkdir /tmp
bin/hadoop fs -mkdir /user/hive/warehouse
bin/hadoop fs -chmod g+w /tmp
bin/hadoop fs -chmod g+w /user/hive/warehouse
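A quick sanity check (not part of the original steps) that the directories were created, run from /opt/hadoop:
bin/hadoop fs -ls /tmp
bin/hadoop fs -ls /user/hive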
4. Start Hive
cd /opt/hive
bin/hive
PS: If it complains that *.jar files are missing, you need to download them:
cd /opt
wget http://archive.cloudera.com/cdh/testing/hive-0.4.0+14.tar.gz
Download the file from the link above, extract it, and copy all the jars in its lib/ directory to /opt/hive/lib/:
tar -zxvf hive-0.4.0+14.tar.gz
mv hive-0.4.0+14 hive2
cp /opt/hive2/lib/*.jar /opt/hive/lib/
Rename the jars to match what the /opt/hive/bin/hive launcher script expects; in my case:
cp hive_exec.jar hive-exec-.jar
cp hive_metastore.jar hive-metastore-.jar
cp hive_cli.jar hive-cli-.jar
After that you can run /opt/hive/bin/hive.
5. If Hive starts successfully, it shows the hive> prompt.
You can then type commands (case does not matter).
a. Create tables
CREATE TABLE pokes (foo INT, bar STRING);
The following statement also declares ds as a partition column, which Hive treats as a virtual column:
CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
b. Show tables
SHOW TABLES;
SHOW TABLES '.*s'; <- lists the tables whose names end in s
c. Describe a table
DESCRIBE invites;
d. Add columns and rename a table
ALTER TABLE pokes ADD COLUMNS (new_col INT);
ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
ALTER TABLE invites RENAME TO invite;
(We still need this table later, so do not actually rename it yet.)
e. Drop a table
DROP TABLE pokes;
(We still need it later, so do not actually drop it yet.)
f. View from the web UI
You can browse the files at http://192.168.1.4:50070 (the NameNode web UI).
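You can also browse the warehouse from the command line; a sketch run from /opt/hadoop, assuming the default warehouse location created in step 3:
bin/hadoop fs -ls /user/hive/warehouse
bin/hadoop fs -ls /user/hive/warehouse/pokes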
6. Load some data. (You can open another terminal to edit the files; otherwise, after exiting and re-entering Hive I had to create the tables again, even though the data files were still there... which is odd. Possibly this is because the default metastore is an embedded Derby database created in whatever directory Hive is started from.)
cd /opt/hive
mkdir examples
cd examples
mkdir files
cd files
gedit kv1.txt
File contents; the columns are separated by a space. (Note that pokes was created without a ROW FORMAT clause, so Hive's default field delimiter is Ctrl-A rather than a space, which means space-separated fields will not be split into separate columns.)
123 string
12 asd
2 1234
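If you do not want to use gedit, the same file can be created straight from the shell; a small sketch that produces exactly the three rows above:
cat > /opt/hive/examples/files/kv1.txt << 'EOF'
123 string
12 asd
2 1234
EOF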
Load it into the pokes table:
hive> LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
hive> LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2009-01-31');
The commands above show what the virtual column ds is for: you can tag the data with extra information such as a date, but you cannot leave ds out when loading into a partitioned table.
OVERWRITE replaces the existing data; if you leave OVERWRITE out, the data is appended to the table instead.
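For example, loading the same file again without OVERWRITE appends the rows to pokes instead of replacing them; a small sketch reusing the file from step 6:
hive> LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' INTO TABLE pokes;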
7. Running SQL queries
Because queries are run as MapReduce jobs, you need to set the following (the values can be taken from /opt/hadoop/conf/hadoop-site.xml):
hive> SET mapred.job.tracker=192.168.1.4:9001;
hive> SET -v; <- lists all current settings; you can change any of them the same way as above
hive> SET mapred.job.tracker; <- shows a single setting
hive> SET fs.default.name=<your namenode host>:9000;
hive> SELECT a.foo FROM invites a WHERE a.ds='2009-01-31';
The query above runs, but the result is only printed to the console; it is not stored anywhere, because we did not say where to put it.
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='2009-01-31';
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
The first statement above stores the result in HDFS under /tmp/hdfs_out.
The second stores the result on the local filesystem under /tmp/local_out as ordinary files.
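To inspect the output afterwards (a sketch; the first path is in HDFS, so read it with the hadoop shell from /opt/hadoop, while the second is an ordinary local directory):
bin/hadoop fs -cat /tmp/hdfs_out/*
cat /tmp/local_out/*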
The next statement puts the result into a table instead:
INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
8. Other SQL
Group by
INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
The FROM clause can also be moved to the front of the statement.
Join
INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar);
Streaming
FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
COUNT(1)
Counts the number of rows:
SELECT COUNT(1) FROM invites;
9. PS: MovieLens ratings example
CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t' <- here you can specify the field delimiter
STORED AS TEXTFILE;
wget http://www.grouplens.org/system/files/ml-data.tar__0.gz
tar -xvzf ml-data.tar__0.gz
LOAD DATA LOCAL INPATH 'ml-data/u.data' OVERWRITE INTO TABLE u_data;
This extracts the downloaded data and loads it into u_data.
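A quick sanity check after loading (the MovieLens u.data file contains 100000 ratings, so the count should come back as 100000):
hive> SELECT COUNT(1) FROM u_data;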
Create a file weekday_mapper.py with the following contents:
import sys
import datetime
for line in sys.stdin:
  line = line.strip()
  userid, movieid, rating, unixtime = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([userid, movieid, rating, str(weekday)])
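Before wiring the script into Hive, you can test it on its own from the shell; a sketch assuming the data was extracted into ml-data/ as above:
head ml-data/u.data | python weekday_mapper.py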
Then register it with:
add FILE weekday_mapper.py;
CREATE TABLE u_data_new (
userid INT,
movieid INT,
rating INT,
weekday INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;
The statement above TRANSFORMs the u_data columns into the u_data_new columns using weekday_mapper.py:
it extracts the day of the week from the unixtime string and stores it as an int.
To view the result:
SELECT weekday, COUNT(1)
FROM u_data_new
GROUP BY weekday;
10. PS: Apache web log example
add jar ../build/contrib/hive_contrib.jar;
CREATE TABLE apachelog (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
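Once the table exists, a log file can be loaded and queried like any other table. A sketch, where the path /var/log/apache2/access.log is only an assumed example and not part of the original notes:
-- the log path below is just an example; point it at your own access log
LOAD DATA LOCAL INPATH '/var/log/apache2/access.log' INTO TABLE apachelog;
SELECT status, COUNT(1) FROM apachelog GROUP BY status;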