Cloudera – Hive Mini lab with Impala data validation

Login to your Cloudera Client with WinSCP and launch putty from there

username/password: cloudera/cloudera

# There is an mini lab file for practice purpose:

# check the file content

# Create a directory in HDFS and copy the file from local storage to HDFS

#Verify the file has been copied into HDFS

Note: Refer to the Apache Hive documentation for Hive related DDL/Operations before start
https://hive.apache.org/

# start "hive" and create an temporary table and use the Hive’s SerDes (serializers/ De-serializers) to parse the logs into individual fields with regular expression

CREATE EXTERNAL TABLE temp_access_logs (
ip STRING,
date STRING,
method STRING,
url STRING,
http_version STRING,
code1 STRING,
code2 STRING,
dash STRING,
user_agent STRING)
ROW FORMAT SERDE ‘org.apache.hadoop.hive.contrib.serde2.RegexSerDe’
WITH SERDEPROPERTIES (
‘input.regex’ = ‘([^ ]*) – – \[([^\]]*)\] "([^ ]*) ([^ ]*) ([^ ]*)" (\d*) (\d*) "([^"]*)" "([^"]*)"’,
‘output.format.string’ = "%1$$s %2$$s %3$$s %4$$s %5$$s %6$$s %7$$s %8$$s %9$$s")
LOCATION ‘/user/hive/warehouse/test_access_logs’;

# Create a finalized external hive table

CREATE EXTERNAL TABLE finalized_access_logs (
ip STRING,
date STRING,
method STRING,
url STRING,
http_version STRING,
code1 STRING,
code2 STRING,
dash STRING,
user_agent STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’
LOCATION ‘/user/hive/warehouse/finalized_access_logs’;

# Add below JAR resource
ADD JAR /usr/lib/hive/lib/hive-contrib.jar;

# insert the data from the temp table into the finalized table

# Exit Hive and use the Impala-shell to validate the data

# Total 18,000 records

Related Posts