Understand Amazon Redshift Query Caching

2018年11月02日

*
查找当前Redshift database的'public' schema下的所有table的名称

SELECT distinct(tablename) 
FROM pg_table_def 
WHERE schemaname = 'public';

Result:

tablename

aircraft

airports

flights

vegas_flights

*
关于SELECT的详细语法，参见：SELECT
https://docs.aws.amazon.com/redshift/latest/dg/r_SELECT_synopsis.html
*
DROP掉之前实验遗留的表

-- BE CAREFUL WHEN DROP TABLES IN PROD! --
DROP TABLE IF EXISTS aircraft;
DROP TABLE IF EXISTS airports;
DROP TABLE IF EXISTS flights;
DROP TABLE IF EXISTS vegas_flights;

*
关于DROP TABLE的详细语法，参见：DROP TABLE
https://docs.aws.amazon.com/redshift/latest/dg/r_DROP_TABLE.html

再次执行SQL query，查看table清理结果。

SELECT distinct(tablename) 
FROM pg_table_def 
WHERE schemaname = 'public';

CREATE TABLE part 
(
  p_partkey     INTEGER NOT NULL,
  p_name        VARCHAR(22) NOT NULL,
  p_mfgr        VARCHAR(6) NOT NULL,
  p_category    VARCHAR(7) NOT NULL,
  p_brand1      VARCHAR(9) NOT NULL,
  p_color       VARCHAR(11) NOT NULL,
  p_type        VARCHAR(25) NOT NULL,
  p_size        INTEGER NOT NULL,
  p_container   VARCHAR(10) NOT NULL
);
 
CREATE TABLE supplier 
(
  s_suppkey   INTEGER NOT NULL,
  s_name      VARCHAR(25) NOT NULL,
  s_address   VARCHAR(25) NOT NULL,
  s_city      VARCHAR(10) NOT NULL,
  s_nation    VARCHAR(15) NOT NULL,
  s_region    VARCHAR(12) NOT NULL,
  s_phone     VARCHAR(15) NOT NULL
);
 
CREATE TABLE customer 
(
  c_custkey      INTEGER NOT NULL,
  c_name         VARCHAR(25) NOT NULL,
  c_address      VARCHAR(25) NOT NULL,
  c_city         VARCHAR(10) NOT NULL,
  c_nation       VARCHAR(15) NOT NULL,
  c_region       VARCHAR(12) NOT NULL,
  c_phone        VARCHAR(15) NOT NULL,
  c_mktsegment   VARCHAR(10) NOT NULL
);
 
CREATE TABLE dwdate 
(
  d_datekey            INTEGER NOT NULL,
  d_date               VARCHAR(19) NOT NULL,
  d_dayofweek          VARCHAR(10) NOT NULL,
  d_month              VARCHAR(10) NOT NULL,
  d_year               INTEGER NOT NULL,
  d_yearmonthnum       INTEGER NOT NULL,
  d_yearmonth          VARCHAR(8) NOT NULL,
  d_daynuminweek       INTEGER NOT NULL,
  d_daynuminmonth      INTEGER NOT NULL,
  d_daynuminyear       INTEGER NOT NULL,
  d_monthnuminyear     INTEGER NOT NULL,
  d_weeknuminyear      INTEGER NOT NULL,
  d_sellingseason      VARCHAR(13) NOT NULL,
  d_lastdayinweekfl    VARCHAR(1) NOT NULL,
  d_lastdayinmonthfl   VARCHAR(1) NOT NULL,
  d_holidayfl          VARCHAR(1) NOT NULL,
  d_weekdayfl          VARCHAR(1) NOT NULL
);
CREATE TABLE lineorder 
(
  lo_orderkey          INTEGER NOT NULL,
  lo_linenumber        INTEGER NOT NULL,
  lo_custkey           INTEGER NOT NULL,
  lo_partkey           INTEGER NOT NULL,
  lo_suppkey           INTEGER NOT NULL,
  lo_orderdate         INTEGER NOT NULL,
  lo_orderpriority     VARCHAR(15) NOT NULL,
  lo_shippriority      VARCHAR(1) NOT NULL,
  lo_quantity          INTEGER NOT NULL,
  lo_extendedprice     INTEGER NOT NULL,
  lo_ordertotalprice   INTEGER NOT NULL,
  lo_discount          INTEGER NOT NULL,
  lo_revenue           INTEGER NOT NULL,
  lo_supplycost        INTEGER NOT NULL,
  lo_tax               INTEGER NOT NULL,
  lo_commitdate        INTEGER NOT NULL,
  lo_shipmode          VARCHAR(10) NOT NULL
);

*
关于CREATE TABLE的详细语法，参见：CREATE TABLE
https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html
*

COPY customer 
FROM 's3://<YOURBUCKETNAME>/ssbgz/customer' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY dwdate 
FROM 's3://<YOURBUCKETNAME>/ssbgz/dwdate' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY lineorder 
FROM 's3://<YOURBUCKETNAME>/ssbgz/lineorder' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY part 
FROM 's3://<YOURBUCKETNAME>/ssbgz/part' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY supplier 
FROM 's3://<YOURBUCKETNAME>/ssbgz/supplier' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';

In my environment:

copy customer from 's3://skycone-lab-us-west-2/redshift/load/customer-fw-manifest'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
fixedwidth 'c_custkey:10, c_name:25, c_address:25, c_city:10, c_nation:15, c_region:12, c_phone:15, c_mktsegment:10'
maxerror 10
acceptinvchars as '^'
manifest;

copy part from 's3://skycone-lab-us-west-2/redshift/load/part-csv.tbl'
credentials 'aws_access_key_id=AKIA***BK45;aws_secret_access_key=tGNwy7****tSma'
csv;

copy dwdate from 's3://skycone-lab-us-west-2/redshift/load/dwdate-tab.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
delimiter '\t'
dateformat 'auto';

copy lineorder from 's3://skycone-lab-us-west-2/redshift/load/lo/lineorder-single.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
gzip
compupdate off
region 'us-west-2';

copy supplier from 's3://awssampledbuswest2/ssbgz/supplier.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
delimiter '|'
gzip
region 'us-west-2';

Replace the <aws-account-id> with your AWS account ID for your testing environment.
Replace the <role-name> with your IAM role name that is attached to the Redshift for this test.
在我的实验环境使用的是myRedshiftRole

关于COPY的详细语法，参见：COPY
https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

Create some users for later verification and test

-- DO NOT USE SIMPLY PASSWORD IN PROD --

CREATE USER leo 
PASSWORD 'ABCd4321';

CREATE USER matt 
PASSWORD 'ABCd4321';

CREATE USER fred 
PASSWORD 'ABCd4321';

关于CREATE USER的详细语法，参见：CREATE USER
https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_USER.html

-- DO NOT GRANT TOO LOOSE privilege IN PROD! --

GRANT ALL PRIVILEGES 
ON ALL TABLES IN SCHEMA public
TO leo;

GRANT ALL PRIVILEGES 
ON ALL TABLES IN SCHEMA public
TO matt;

GRANT SELECT, INSERT  
ON TABLE customer, dwdate, lineorder, part, supplier
TO fred;

关于GRANT的详细语法，参见：GRANT
https://docs.aws.amazon.com/redshift/latest/dg/r_GRANT.html

在Redshift的Query editor里，enable_result_cache_for_session的作用域测试下来感觉是在各个Query Tab里面。

Result caching is enabled by default. To disable result caching for the current session, set the enable_result_cache_for_session parameter to off.

Change user to "matt".

-

SET 
enable_result_cache_for_session TO 
OFF;

Run the following queries twice to eliminate compile time. Record the execution time used for each query.

Run the SQL statement for the first time.

select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Returned result when run the SQL statement for the first time:

Execution time: 0.11s
Elapsed time: 4337 ms

Run the SQL statement for the second time.

select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Returned result when run the SQL statement for the second time:

Execution time: 0s
Elapsed time: 128 ms

Force enable query cache:

SET 
enable_result_cache_for_session TO 
ON;

Run the SQL statement for the 3rd time.

select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Result:

Elapsed time: 113 ms

Change user to "leo".
Run the same SQL statement.

select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Result:

ELAPSED TIME: 00 m 28 s
Elapsed time: 9 ms

Below SQL command should run with 'master' ('awsuser') user, at least not with a normal user.

SELECT userid, query, elapsed, source_query 
FROM svl_qlog 
WHERE userid > 1
ORDER BY query DESC;

Result (I intentionally hid some irrelevant returned rows).

userid	query	elapsed	source_query
104	11923	47	11906
103	11906	15870

...
也就是说，leo（usesysid 103）后来的那个SQL query的source_query是matt（usesysid 104）之前的那条SQL查询。

SELECT * 
FROM pg_user
WHERE usesysid >= 100;

Result:

usename	usesysid	usecreatedb	usesuper	usecatupd	passwd	valuntil	useconfig
master	100	TRUE	TRUE	FALSE	********
leo	103	FALSE	FALSE	FALSE	********
matt	104	FALSE	FALSE	FALSE	********
fred	105	FALSE	FALSE	FALSE	********

关于USERS Table的详细信息，参见：USERS Table
https://docs.aws.amazon.com/redshift/latest/dg/r_userstable.html

Column Name	Data Type	Description
USERID	INTEGER	Primary key, a unique ID value for each row. Each row represents a registered user (a buyer or seller or both) who has listed or bought tickets for at least one event.
USERNAME	CHAR(8)	An 8-character alphanumeric username, such as `PGL08LJI`.
FIRSTNAME	VARCHAR(30)	The user's first name, such as `Victor`.
LASTNAME	VARCHAR(30)	The user's last name, such as `Hernandez`.
CITY	VARCHAR(30)	The user's home city, such as `Naperville`.
STATE	CHAR(2)	The user's home state, such as `GA`.
EMAIL	VARCHAR(100)	The user's email address; this column contains random Latin values, such as `turpis@accumsanlaoreet.org`.
PHONE	CHAR(14)	The user's 14-character phone number, such as `(818) 765-4255`.
LIKESPORTS, ...	BOOLEAN	A series of 10 different columns that identify the user's likes and dislikes with `true` and `false` values.

Run below SQL command using 'master' user.

SELECT pg_user.usename, svl_qlog.*
FROM svl_qlog left outer join pg_user on pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100;

Result:

usename

userid

query

xid

pid

starttime

endtime

elapsed

aborted

label

substring

source_query

concurrency_scaling_status_txt

from_sp_call

leo

101

350

2730

1073881432

01:35.4

100

default

select sum(qtysold) from sales, date where sales.dateid = da

333

0 - Ran on the main cluster

NULL

matt

102

333

2655

1073955173

00:18.2

84009

default

select sum(qtysold) from sales, date where sales.dateid = da

NULL

0 - Ran on the main cluster

NULL

SELECT clause could use tablename.* format to specify all columns under one table

To make the result more concise:

SELECT pg_user.usename, svl_qlog.query, svl_qlog.elapsed, svl_qlog.source_query, svl_qlog.substring
FROM svl_qlog LEFT OUTER JOIN pg_user ON pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100
ORDER BY query DESC;

Result:

usename	query	elapsed	source_query	substring
leo	350	100	333	select sum(qtysold) from sales, date where sales.dateid = da
matt	333	84009	NULL	select sum(qtysold) from sales, date where sales.dateid = da

EXPLAIN
SELECT pg_user.usename, svl_qlog.query, svl_qlog.elapsed, svl_qlog.source_query, svl_qlog.substring
FROM svl_qlog LEFT OUTER JOIN pg_user ON pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100
ORDER BY query DESC;

Result:

QUERY PLAN

XN Merge (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)

Merge Key: stll_query.query

-> XN Network (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)

Send to leader

-> XN Sort (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)

Sort Key: stll_query.query

-> XN Hash Join DS_BCAST_INNER (cost=23.26..649840075.21 rows=35 width=12156)

Hash Cond: ("outer".userid = "inner".usesysid)

-> XN Hash Left Join DS_BCAST_INNER (cost=18.35..505640068.37 rows=105 width=12032)

Hash Cond: ("outer".query = "inner".query)

-> XN Hash Left Join DS_BCAST_INNER (cost=2.98..358040031.00 rows=17 width=12032)

Hash Cond: (("outer".query = "inner".query) AND ("outer".userid = "inner".userid) AND ("outer".xid = "inner".xid))

-> XN Hash Right Join DS_DIST_BOTH (cost=0.00..317240027.75 rows=7 width=12040)

Outer Dist Key: stll_result_cache_history.cache_hit_query

Inner Dist Key: stll_query.query

Hash Cond: ("outer".cache_hit_query = "inner".query)

-> XN Seq Scan on stll_result_cache_history (cost=0.00..12.30 rows=1230 width=8)

-> XN Hash (cost=0.00..0.00 rows=1 width=12036)

-> XN Seq Scan on stll_query (cost=0.00..0.00 rows=1 width=12036)

-> XN Hash (cost=1.70..1.70 rows=170 width=16)

-> XN Seq Scan on stll_wlm_query (cost=0.00..1.70 rows=170 width=16)

-> XN Hash (cost=12.30..12.30 rows=1230 width=4)

-> XN Seq Scan on stll_stored_proc_call_map (cost=0.00..12.30 rows=1230 width=4)

-> XN Hash (cost=4.65..4.65 rows=103 width=132)

-> LD Seq Scan on pg_shadow (cost=0.00..4.65 rows=103 width=132)

Filter: ((usesysid >= 100) AND (usename !~~ 'f346c9b8%'::text))

REVOKE本次实验创建的用户的权限

REVOKE 
ALL 
ON ALL TABLES IN SCHEMA public
FROM leo,matt;

REVOKE 
ALL 
ON TABLE customer, dwdate, lineorder, part, supplier
FROM fred;

关于REVOKE的详细信息，参见：REVOKE
https://docs.aws.amazon.com/redshift/latest/dg/r_REVOKE.html

删除本次实验创建的用户

DROP USER IF EXISTS leo, matt, fred;

关于DROP USER的详细信息，参见：DROP USER
https://docs.aws.amazon.com/redshift/latest/dg/r_DROP_USER.html

查看用户（除rdsdb和master）

SELECT * 
FROM pg_user
WHERE usesysid > 100;

References

Result caching

Category: data Tags: Redshift public

Sky Cone

Understand Amazon Redshift Query Caching