-- Create a Hive external table over the existing CSV data
CREATE EXTERNAL TABLE logs_csv (
  date_time string,
  category string,
  pdp_ip string,
  pdp_port string,
  dns_ip string,
  cust_browsed_ip string,
  country string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION 'hdfs://xxxx-xxxxxx/ftplogs';
-- Sync the metastore with directories already present under the table location
-- (needed only if the table is partitioned)
MSCK REPAIR TABLE logs_csv;
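-- Optional sanity check: preview a few rows to confirm the delimiter and
-- column order line up with the files already sitting in the location above.
SELECT * FROM logs_csv LIMIT 10;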
-- Now create an external table stored in Parquet format
CREATE EXTERNAL TABLE logs_parquet (
  date_time string,
  category string,
  pdp_ip string,
  pdp_port string,
  dns_ip string,
  cust_browsed_ip string,
  country string
)
STORED AS PARQUET
LOCATION 'hdfs://xxxx-xxxxx/logsparquet';
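-- Optional: confirm the new table's storage format and location before loading it.
DESCRIBE FORMATTED logs_parquet;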
-- Time to convert and export. This step can run for a long time, depending on your data size and cluster size.
INSERT OVERWRITE TABLE logs_parquet
SELECT date_time, category, pdp_ip, pdp_port, dns_ip, cust_browsed_ip, country
FROM logs_csv;
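-- Optional: verify the conversion by comparing row counts between the source and target tables.
SELECT COUNT(*) FROM logs_csv;
SELECT COUNT(*) FROM logs_parquet;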