Login to your Cloudera Client with WinSCP and launch putty from there
username/password: cloudera/cloudera
# There is a mini lab file for practice purposes:
# check the file content
# Create a directory in HDFS and copy the file from local storage to HDFS
#Verify the file has been copied into HDFS
Note: Refer to the Apache Hive documentation for Hive-related DDL/operations before starting
https://hive.apache.org/
# start "hive" and create a temporary table, using Hive's SerDe (serializer/deserializer) to parse the logs into individual fields with a regular expression
-- Staging table over the raw Apache access logs. The contrib RegexSerDe splits
-- each log line into columns using 'input.regex' (one capture group per column).
-- NOTE: `ADD JAR /usr/lib/hive/lib/hive-contrib.jar;` must run in the session
-- before this SerDe class can be resolved.
CREATE EXTERNAL TABLE temp_access_logs (
    ip           STRING,
    `date`       STRING,   -- `date` is a Hive reserved word; keep it back-quoted
    method       STRING,
    url          STRING,
    http_version STRING,
    code1        STRING,   -- HTTP response status code
    code2        STRING,   -- response size field from the log line
    dash         STRING,   -- referrer field ("-" when absent)
    user_agent   STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    -- ASCII quotes only; backslashes are doubled because Hive string literals
    -- consume one level of escaping before the regex engine sees the pattern.
    'input.regex' = '([^ ]*) - - \\[([^\\]]*)\\] "([^ ]*) ([^ ]*) ([^ ]*)" (\\d*) (\\d*) "([^"]*)" "([^"]*)"',
    -- java.util.Formatter positional syntax is %1$s (single $), not %1$$s
    'output.format.string' = '%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s')
LOCATION '/user/hive/warehouse/test_access_logs';
# Create a finalized external hive table
-- Final external table holding the parsed log fields as comma-delimited text.
-- Column list mirrors temp_access_logs so an INSERT ... SELECT maps 1:1.
CREATE EXTERNAL TABLE finalized_access_logs (
    ip           STRING,
    `date`       STRING,   -- `date` is a Hive reserved word; keep it back-quoted
    method       STRING,
    url          STRING,
    http_version STRING,
    code1        STRING,   -- HTTP response status code
    code2        STRING,   -- response size field from the log line
    dash         STRING,   -- referrer field ("-" when absent)
    user_agent   STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','   -- ASCII quotes, not curly quotes
LOCATION '/user/hive/warehouse/finalized_access_logs';
# Add below JAR resource
-- Registers the contrib jar that provides RegexSerDe; must be run in the Hive
-- session before creating or querying the RegexSerDe-backed table above.
ADD JAR /usr/lib/hive/lib/hive-contrib.jar;
# insert the data from the temp table into the finalized table
# Exit Hive and use the Impala-shell to validate the data