Work With External Tables In Snowflake
Check out my courses on Udemy. Popular courses include building a utility with the Snowpark Python API that ingests CSV and JSON data automatically, without writing any DDL/DML statements, saving 95% of the manual effort.
This hands-on guide, “How to work with external tables in Snowflake,” demonstrates everything about Snowflake external tables: how to connect to your cloud data lake and query your CSV, Parquet, ORC, Avro, JSON & XML data sets.
This hands-on visual guide will help you answer the following questions:
1. What is an external table in Snowflake?
2. How do you create an external table and map it to an external stage (like AWS S3)?
3. How do you auto-ingest data via an external table?
4. How do you work with Parquet files and external tables?
5. And many more.
Sample Data Files (CSV, Parquet, ORC, JSON)
Part-1 SQL Scripts — External Table For CSV Files
-- let's create an external stage in s3
drop stage if exists s3_customer_csv;
create stage s3_customer_csv
url = 's3://my-s3-data-lake/customer/csv/'
comment = 'this customer csv data';
-- desc the stage and validate the definition
-- and on_error parameters
desc stage s3_customer_csv;
-- let's list the stage to see all the files and paths
list @s3_customer_csv;
-- create a csv file format
create or replace file format csv_ff
type = 'csv'
compression = 'auto'
field_delimiter = ','
record_delimiter = '\n'
skip_header = 0
field_optionally_enclosed_by = '\042'
null_if = ('\\N');
-- query the external stage using $ notation
select t.$1, t.$2, t.$3, t.$4, t.$5, t.$6, t.$7, t.$8
from
@s3_customer_csv (file_format => 'csv_ff') t;
-- what if you reference one extra column ($9) in your $ notation?
select t.$1, t.$2, t.$3, t.$4, t.$5, t.$6, t.$7, t.$8, t.$9
from
@s3_customer_csv (file_format => 'csv_ff') t;
// the extra column will appear as null, so be careful with such columns.
create or replace external TABLE customer_csv_et (
CUST_KEY varchar AS (value:c1::varchar),
NAME varchar AS (value:c2::varchar),
ADDRESS varchar AS (value:c3::varchar),
NATION_KEY varchar AS (value:c4::varchar),
PHONE varchar AS (value:c5::varchar),
ACCOUNT_BALANCE varchar AS (value:c6::varchar),
MARKET_SEGMENT varchar AS (value:c7::varchar),
COMMENT varchar AS (value:c8::varchar)
)
with location=@s3_customer_csv
auto_refresh = false
file_format = (format_name = csv_ff)
;
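-- note: refresh_on_create defaults to true, so files already in the stage are
-- registered when the table is created; files added later are picked up only
-- after a manual refresh (or via auto_refresh)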
-- let's see the extra VALUE column; it is exposed in json format.
select * from customer_csv_et;
-- you can also create an external table without defining any columns;
-- it then exposes only the VALUE variant column
create or replace external table customer_csv_et_dummy
with location=@s3_customer_csv
auto_refresh = false
file_format = (format_name = csv_ff);
-- select the dummy table
select * from customer_csv_et_dummy;
-- fetch the value column and metadata column
select value, metadata$filename from customer_csv_et;
-- filter rows from a specific file using the metadata column
select value, metadata$filename from customer_csv_et where metadata$filename = 'customer/csv/customer_003.csv';
-- if you are not sure about the column names, you can simply define a single dummy column and inspect the values
create or replace external TABLE customer_csv_et_dummy (
col1 varchar AS (value:c1::varchar)
)
with location=@s3_customer_csv
auto_refresh = false
file_format = (format_name = csv_ff);
select * from customer_csv_et_dummy;
-- clean up
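-- for example, drop the demo dummy table once you are done with it
drop external table if exists customer_csv_et_dummy;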
Part-2 SQL Scripts — Table Partition
-- let's create an external stage in s3
drop stage if exists s3_customer_csv_partition;
create stage s3_customer_csv_partition
url = 's3://my-s3-data-lake/partition/segment/'
comment = 'customer data partitioned by market segment';
-- desc the stage and validate the definition
-- and on_error parameters
desc stage s3_customer_csv_partition;
-- let's list the stage to see all the files and paths
list @s3_customer_csv_partition;
-- query the external stage using $ notation
select t.$1, t.$2, t.$3, t.$4, t.$5, t.$6, t.$7, t.$8
from
@s3_customer_csv_partition (file_format => 'csv_ff') t;
-- check the partition files
select metadata$filename from @s3_customer_csv_partition;
select split_part(metadata$filename, '/', 3) from @s3_customer_csv_partition;
-- create an external table partitioned by the folder name in the file path
create or replace external TABLE customer_partition_et (
partition_col varchar AS (split_part(metadata$filename, '/', 3)),
CUST_KEY varchar AS (value:c1::varchar),
NAME varchar AS (value:c2::varchar),
ADDRESS varchar AS (value:c3::varchar),
NATION_KEY varchar AS (value:c4::varchar),
PHONE varchar AS (value:c5::varchar),
ACCOUNT_BALANCE varchar AS (value:c6::varchar),
MARKET_SEGMENT varchar AS (value:c7::varchar),
COMMENT varchar AS (value:c8::varchar)
)
partition by (partition_col)
with location=@s3_customer_csv_partition
file_format = (format_name = csv_ff)
;
create or replace external TABLE customer_no_partition_et (
CUST_KEY varchar AS (value:c1::varchar),
NAME varchar AS (value:c2::varchar),
ADDRESS varchar AS (value:c3::varchar),
NATION_KEY varchar AS (value:c4::varchar),
PHONE varchar AS (value:c5::varchar),
ACCOUNT_BALANCE varchar AS (value:c6::varchar),
MARKET_SEGMENT varchar AS (value:c7::varchar),
COMMENT varchar AS (value:c8::varchar)
)
with location=@s3_customer_csv_partition
file_format = (format_name = csv_ff)
;
-- this updates the metadata
alter external table customer_partition_et refresh;
alter external table customer_csv_et refresh;
-- compare bytes scanned & execution time, partitioned vs non-partitioned
select * from customer_partition_et where partition_col='MACHINERY';
select * from customer_csv_et where MARKET_SEGMENT='MACHINERY';
select * from customer_partition_et where partition_col='FURNITURE';
select * from customer_no_partition_et where MARKET_SEGMENT='FURNITURE';
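-- one way to compare bytes scanned and elapsed time for the queries above:
-- query_history shows the most recent queries for the current session
select query_text, bytes_scanned, total_elapsed_time
from table(information_schema.query_history())
order by start_time desc
limit 5;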
Part-3 SQL Scripts — External Table & Streams
-- create stage
drop stage if exists s3_customer_stream;
create stage s3_customer_stream
url = 's3://my-s3-data-lake/customer_stream/'
comment = 'customer csv data with auto-refresh enabled';
list @s3_customer_stream;
-- create external table with auto refresh flag true
create or replace external TABLE customer_stream (
CUST_KEY varchar AS (value:c1::varchar),
NAME varchar AS (value:c2::varchar),
ADDRESS varchar AS (value:c3::varchar),
NATION_KEY varchar AS (value:c4::varchar),
PHONE varchar AS (value:c5::varchar),
ACCOUNT_BALANCE varchar AS (value:c6::varchar),
MARKET_SEGMENT varchar AS (value:c7::varchar),
COMMENT varchar AS (value:c8::varchar)
)
with location=@s3_customer_stream
auto_refresh = true
file_format = (format_name = csv_ff)
;
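-- note: with s3, auto_refresh = true only takes effect once the bucket sends
-- event notifications to the sqs queue snowflake creates for this table;
-- the queue arn is shown in the notification_channel column of:
show external tables like 'customer_stream';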
-- check how many records we have
select count(*) from customer_stream; -- ? records
-- check which files are currently registered with the external table
select * from table(information_schema.external_table_files(table_name=>'customer_stream'));
-- add 5 more files
-- check file count within stage
list @s3_customer_stream;
-- add more files & check list
list @s3_customer_stream;
-- check record count (unchanged until the table is refreshed)
select count(*) from customer_stream; -- ? records
-- refresh the metadata manually
alter external table customer_stream refresh;
-- now the new records will appear
select count(*) from customer_stream;
Part-4 SQL Scripts — Parquet File Format For External Table
-- stage with parquet as file format
create or replace stage s3_etg_customer_parquet_ff
url = 's3://my-s3-data-lake/customer/parquet/'
file_format = (type='parquet' compression='snappy')
comment = 'this customer parquet data stage with a file format attached to it';
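-- note: the external tables below reference a named file format parquet_ff
-- that is not created elsewhere in this script; a minimal definition could be:
create or replace file format parquet_ff
type = 'parquet'
compression = 'snappy';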
-- s3 external stage without any file format attached to it
create stage s3_etg_customer_parquet
url = 's3://my-s3-data-lake/customer/parquet/'
comment = 'this customer parquet data';
-- let's create an external stage in s3
drop stage if exists s3_customer_parquet;
create stage s3_customer_parquet
url = 's3://my-s3-data-lake/customer/parquet/'
comment = 'this customer parquet data';
list @s3_customer_parquet;
desc stage s3_customer_parquet;
-- the following returns only a json-style value
-- you can see the column header is $1
select * from @s3_etg_customer_parquet_ff;
-- get all the file names
select metadata$filename from @s3_etg_customer_parquet_ff;
-- the value column will not work in this approach (stage queries expose only $ columns)
select value,metadata$filename from @s3_etg_customer_parquet_ff;
select $1:CUSTOMER_KEY from @s3_etg_customer_parquet_ff;
-- field names are case sensitive; lowercase returns null
select $1:customer_key from @s3_etg_customer_parquet_ff;
-- any other field which does not exist also returns null
select $1:unknown_key from @s3_etg_customer_parquet_ff;
select
$1:CUSTOMER_KEY::varchar,
$1:NAME::varchar,
$1:ADDRESS::varchar,
$1:COUNTRY_KEY::varchar,
$1:PHONE::varchar,
$1:ACCT_BAL::decimal(10,2),
$1:MKT_SEGMENT::varchar,
$1:COMMENT::varchar
from @s3_etg_customer_parquet_ff;
-- the following will throw an error (no file format is specified for the external table)
create or replace external TABLE customer_par_ff (
CUST_KEY varchar AS ($1::varchar)
)
with location=@s3_etg_customer_parquet_ff;
-- error: the value:c1/c2 notation applies to csv, not parquet
create or replace external TABLE customer_par_ff (
CUST_KEY varchar AS (value:c1::varchar)
)
with location=@s3_etg_customer_parquet_ff
file_format = (format_name = parquet_ff);
-- the following is the correct approach
create or replace external TABLE customer_par_ff (
CUST_KEY varchar AS ($1:CUSTOMER_KEY::varchar)
)
with location=@s3_etg_customer_parquet_ff
file_format = (format_name = parquet_ff);
create or replace external TABLE customer_par_ff (
CUST_KEY varchar AS ($1:CUSTOMER_KEY::varchar),
NAME varchar AS ($1:NAME::varchar),
ADDRESS varchar AS ($1:ADDRESS::varchar),
NATION_KEY varchar AS ($1:COUNTRY_KEY::varchar),
PHONE varchar AS ($1:PHONE::varchar),
ACCOUNT_BALANCE varchar AS ($1:ACCT_BAL::varchar),
MARKET_SEGMENT varchar AS ($1:MKT_SEGMENT::varchar),
COMMENT varchar AS ($1:COMMENT::varchar)
)
with location=@s3_etg_customer_parquet_ff
file_format = (format_name = parquet_ff);
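-- query the parquet-backed external table
select * from customer_par_ff;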
Part-5 SQL Scripts — ORC File Format For External Table
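-- note: the stage and table in this part reference a named file format orc_ff
-- that is not created elsewhere in this script; a minimal definition could be:
create or replace file format orc_ff
type = 'orc';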
-- stage with orc as file format
create stage s3_stg_customer_orc
url = 's3://my-s3-data-lake/customer/orc/'
file_format = (format_name = 'orc_ff')
comment = 'this customer orc data';
-- list the stage files
list @s3_stg_customer_orc;
-- desc the stage
desc stage s3_stg_customer_orc;
-- select from the orc external stage
select * from @s3_stg_customer_orc;
create or replace external TABLE customer_orc (
CUST_KEY varchar AS ($1:CUSTOMER_KEY::varchar),
NAME varchar AS ($1:NAME::varchar),
ADDRESS varchar AS ($1:ADDRESS::varchar),
NATION_KEY varchar AS ($1:COUNTRY_KEY::varchar),
PHONE varchar AS ($1:PHONE::varchar),
ACCOUNT_BALANCE varchar AS ($1:ACCT_BAL::varchar),
MARKET_SEGMENT varchar AS ($1:MKT_SEGMENT::varchar),
COMMENT varchar AS ($1:COMMENT::varchar)
)
with location=@s3_stg_customer_orc
auto_refresh = false
file_format = (format_name = orc_ff)
;
-- let's select the table
select * from customer_orc;
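-- a materialized view on top of the external table can speed up repeated
-- queries (note: materialized views require Enterprise Edition or higher)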
create or replace materialized view my_mv_on_et as
select CUST_KEY, name, address, nation_key, phone, account_balance, market_segment, comment from customer_orc;
select * from my_mv_on_et;
Part-6 SQL Scripts — Error Handling
-- let's create an external stage in s3 (csv files mixed with other formats)
create stage s3_customer_error
url = 's3://my-s3-data-lake/customer_with_error/'
comment = 'customer csv data that also contains orc and parquet files';
-- desc the stage and validate the definition
-- and on_error parameters
desc stage s3_customer_error;
-- let's list the stage to see all the files and paths
list @s3_customer_error;
create or replace external TABLE customer_orc_error (
CUST_KEY varchar AS ($1:CUSTOMER_KEY::varchar),
NAME varchar AS ($1:NAME::varchar),
ADDRESS varchar AS ($1:ADDRESS::varchar),
NATION_KEY varchar AS ($1:COUNTRY_KEY::varchar),
PHONE varchar AS ($1:PHONE::varchar),
ACCOUNT_BALANCE varchar AS ($1:ACCT_BAL::varchar),
MARKET_SEGMENT varchar AS ($1:MKT_SEGMENT::varchar),
COMMENT varchar AS ($1:COMMENT::varchar)
)
with location=@s3_customer_error
auto_refresh = false
file_format = (format_name = orc_ff)
;
select * from customer_orc_error;
select count(*) from customer_orc_error;
-- list files
list @s3_customer_error;
-- check table count
select count(*) from customer_orc_error;
-- check the file registration history
select * from table(information_schema.external_table_file_registration_history(table_name=>'customer_orc_error'));
-- refresh table via alter statement
alter external table customer_orc_error refresh;
-- metadata history
select * from table(information_schema.external_table_file_registration_history(table_name=>'customer_orc_error'));
-- check table count
select count(*) from customer_orc_error;
select * from customer_orc_error;
select *
from table(information_schema.external_table_file_registration_history(
start_time=>dateadd('hour',-1,current_timestamp()),
table_name=>'customer_orc_error'));
Part-7 SQL Scripts — External Table Pipe Status
select system$external_table_pipe_status('customer_stream');
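-- the status comes back as a json string; parse_json lets you pull out
-- individual fields (field names assumed to match system$pipe_status output)
select parse_json(system$external_table_pipe_status('customer_stream')):executionState;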
select * from
table(
information_schema.external_table_files(table_name=>'customer_csv_et')
);
select *
from table(information_schema.external_table_file_registration_history(table_name=>'customer_csv_et'));
-- Retrieve the registration events for external table customer_stream that started within the last 48 hours:
select *
from table(information_schema.external_table_file_registration_history(
start_time=>dateadd('hour',-48,current_timestamp()),
table_name=>'customer_stream'));
select *
from table(information_schema.auto_refresh_registration_history(
date_range_start=>to_timestamp_tz('2021-11-01 12:00:00.000 -0700'),
date_range_end=>to_timestamp_tz('2021-11-08 12:30:00.000 -0700'),
object_type=>'external_table'));
-- Same as the previous example, but retrieves the billing history for the last 14 days, in 1 day periods:
select *
from table(information_schema.auto_refresh_registration_history(
date_range_start=>dateadd('day',-14,current_date()),
date_range_end=>current_date(),
object_type=>'external_table'));
-- Retrieve the billing history for the external table customer_csv_et in the active schema in the session for the last 12 hours, in 1 hour periods:
select *
from table(information_schema.auto_refresh_registration_history(
date_range_start=>dateadd('hour',-12,current_timestamp()),
object_type=>'external_table',
object_name=>'customer_csv_et'));