Understand Amazon Redshift Query Caching

2018年11月02日

*
查找当前Redshift database的'public' schema下的所有table的名称

SELECT distinct(tablename) 
FROM pg_table_def 
WHERE schemaname = 'public';
Result:
tablename
aircraft
airports
flights
vegas_flights
*
关于SELECT的详细语法,参见:SELECT
https://docs.aws.amazon.com/redshift/latest/dg/r_SELECT_synopsis.html
*
DROP掉之前实验遗留的表
-- BE CAREFUL WHEN DROP TABLES IN PROD! --
DROP TABLE IF EXISTS aircraft;
DROP TABLE IF EXISTS airports;
DROP TABLE IF EXISTS flights;
DROP TABLE IF EXISTS vegas_flights;
*
关于DROP TABLE的详细语法,参见:DROP TABLE
https://docs.aws.amazon.com/redshift/latest/dg/r_DROP_TABLE.html

再次执行SQL query,查看table清理结果。
SELECT distinct(tablename) 
FROM pg_table_def 
WHERE schemaname = 'public';

CREATE TABLE part 
(
  p_partkey     INTEGER NOT NULL,
  p_name        VARCHAR(22) NOT NULL,
  p_mfgr        VARCHAR(6) NOT NULL,
  p_category    VARCHAR(7) NOT NULL,
  p_brand1      VARCHAR(9) NOT NULL,
  p_color       VARCHAR(11) NOT NULL,
  p_type        VARCHAR(25) NOT NULL,
  p_size        INTEGER NOT NULL,
  p_container   VARCHAR(10) NOT NULL
);
 
CREATE TABLE supplier 
(
  s_suppkey   INTEGER NOT NULL,
  s_name      VARCHAR(25) NOT NULL,
  s_address   VARCHAR(25) NOT NULL,
  s_city      VARCHAR(10) NOT NULL,
  s_nation    VARCHAR(15) NOT NULL,
  s_region    VARCHAR(12) NOT NULL,
  s_phone     VARCHAR(15) NOT NULL
);
 
CREATE TABLE customer 
(
  c_custkey      INTEGER NOT NULL,
  c_name         VARCHAR(25) NOT NULL,
  c_address      VARCHAR(25) NOT NULL,
  c_city         VARCHAR(10) NOT NULL,
  c_nation       VARCHAR(15) NOT NULL,
  c_region       VARCHAR(12) NOT NULL,
  c_phone        VARCHAR(15) NOT NULL,
  c_mktsegment   VARCHAR(10) NOT NULL
);
 
CREATE TABLE dwdate 
(
  d_datekey            INTEGER NOT NULL,
  d_date               VARCHAR(19) NOT NULL,
  d_dayofweek          VARCHAR(10) NOT NULL,
  d_month              VARCHAR(10) NOT NULL,
  d_year               INTEGER NOT NULL,
  d_yearmonthnum       INTEGER NOT NULL,
  d_yearmonth          VARCHAR(8) NOT NULL,
  d_daynuminweek       INTEGER NOT NULL,
  d_daynuminmonth      INTEGER NOT NULL,
  d_daynuminyear       INTEGER NOT NULL,
  d_monthnuminyear     INTEGER NOT NULL,
  d_weeknuminyear      INTEGER NOT NULL,
  d_sellingseason      VARCHAR(13) NOT NULL,
  d_lastdayinweekfl    VARCHAR(1) NOT NULL,
  d_lastdayinmonthfl   VARCHAR(1) NOT NULL,
  d_holidayfl          VARCHAR(1) NOT NULL,
  d_weekdayfl          VARCHAR(1) NOT NULL
);
CREATE TABLE lineorder 
(
  lo_orderkey          INTEGER NOT NULL,
  lo_linenumber        INTEGER NOT NULL,
  lo_custkey           INTEGER NOT NULL,
  lo_partkey           INTEGER NOT NULL,
  lo_suppkey           INTEGER NOT NULL,
  lo_orderdate         INTEGER NOT NULL,
  lo_orderpriority     VARCHAR(15) NOT NULL,
  lo_shippriority      VARCHAR(1) NOT NULL,
  lo_quantity          INTEGER NOT NULL,
  lo_extendedprice     INTEGER NOT NULL,
  lo_ordertotalprice   INTEGER NOT NULL,
  lo_discount          INTEGER NOT NULL,
  lo_revenue           INTEGER NOT NULL,
  lo_supplycost        INTEGER NOT NULL,
  lo_tax               INTEGER NOT NULL,
  lo_commitdate        INTEGER NOT NULL,
  lo_shipmode          VARCHAR(10) NOT NULL
);
*
关于CREATE TABLE的详细语法,参见:CREATE TABLE
https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html
*
COPY customer 
FROM 's3://<YOURBUCKETNAME>/ssbgz/customer' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY dwdate 
FROM 's3://<YOURBUCKETNAME>/ssbgz/dwdate' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY lineorder 
FROM 's3://<YOURBUCKETNAME>/ssbgz/lineorder' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY part 
FROM 's3://<YOURBUCKETNAME>/ssbgz/part' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';
 
COPY supplier 
FROM 's3://<YOURBUCKETNAME>/ssbgz/supplier' 
credentials 'aws_iam_role=arn:aws-cn:iam::<aws-account-id>:role/<role-name>' 
gzip compupdate off region 'cn-northwest-1';

In my environment:
copy customer from 's3://skycone-lab-us-west-2/redshift/load/customer-fw-manifest'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
fixedwidth 'c_custkey:10, c_name:25, c_address:25, c_city:10, c_nation:15, c_region:12, c_phone:15, c_mktsegment:10'
maxerror 10
acceptinvchars as '^'
manifest;

copy part from 's3://skycone-lab-us-west-2/redshift/load/part-csv.tbl'
credentials 'aws_access_key_id=AKIA***BK45;aws_secret_access_key=tGNwy7****tSma'
csv;

copy dwdate from 's3://skycone-lab-us-west-2/redshift/load/dwdate-tab.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
delimiter '\t'
dateformat 'auto';

copy lineorder from 's3://skycone-lab-us-west-2/redshift/load/lo/lineorder-single.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
gzip
compupdate off
region 'us-west-2';

copy supplier from 's3://awssampledbuswest2/ssbgz/supplier.tbl'
iam_role 'arn:aws:iam::123456789012:role/myRedshiftRole'
delimiter '|'
gzip
region 'us-west-2';

Replace the <aws-account-id> with your AWS account ID for your testing environment.
Replace the <role-name> with your IAM role name that is attached to the Redshift for this test.
在我的实验环境使用的是myRedshiftRole


关于COPY的详细语法,参见:COPY
https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html

Create some users for later verification and test
-- DO NOT USE SIMPLY PASSWORD IN PROD --

CREATE USER leo 
PASSWORD 'ABCd4321';

CREATE USER matt 
PASSWORD 'ABCd4321';

CREATE USER fred 
PASSWORD 'ABCd4321';

关于CREATE USER的详细语法,参见:CREATE USER
https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_USER.html

-- DO NOT GRANT TOO LOOSE privilege IN PROD! --

GRANT ALL PRIVILEGES 
ON ALL TABLES IN SCHEMA public
TO leo;

GRANT ALL PRIVILEGES 
ON ALL TABLES IN SCHEMA public
TO matt;

GRANT SELECT, INSERT  
ON TABLE customer, dwdate, lineorder, part, supplier
TO fred;

关于GRANT的详细语法,参见:GRANT
https://docs.aws.amazon.com/redshift/latest/dg/r_GRANT.html

在Redshift的Query editor里,enable_result_cache_for_session的作用域测试下来感觉是在各个Query Tab里面。

Result caching is enabled by default. To disable result caching for the current session, set the enable_result_cache_for_session parameter to off.

Change user to "matt".



-
SET 
enable_result_cache_for_session TO 
OFF;

Run the following queries twice to eliminate compile time. Record the execution time used for each query.

Run the SQL statement for the first time.
select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Returned result when run the SQL statement for the first time:
Execution time: 0.11s
Elapsed time: 4337 ms

Run the SQL statement for the second time.
select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';

Returned result when run the SQL statement for the second time:
Execution time: 0s
Elapsed time: 128 ms

Force enable query cache:
SET 
enable_result_cache_for_session TO 
ON;

Run the SQL statement for the 3rd time.
select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';
Result:
Elapsed time: 113 ms

Change user to "leo".
Run the same SQL statement.
select sum(qtysold) from sales, date
where sales.dateid = date.dateid and date.month = 'APR';
Result:
ELAPSED TIME: 00 m 28 s
Elapsed time: 9 ms

Below SQL command should run with 'master' ('awsuser') user, at least not with a normal user.
SELECT userid, query, elapsed, source_query 
FROM svl_qlog 
WHERE userid > 1
ORDER BY query DESC;
Result (I intentionally hid some irrelevant returned rows).
userid query elapsed source_query
104 11923 47 11906
103 11906 15870  
...
也就是说,leo(usesysid 103)后来的那个SQL query的source_query是matt(usesysid 104)之前的那条SQL查询。
SELECT * 
FROM pg_user
WHERE usesysid >= 100;
Result:
usename usesysid usecreatedb usesuper usecatupd passwd valuntil useconfig
master 100 TRUE TRUE FALSE ********    
leo 103 FALSE FALSE FALSE ********    
matt 104 FALSE FALSE FALSE ********    
fred 105 FALSE FALSE FALSE ********    

关于USERS Table的详细信息,参见:USERS Table
https://docs.aws.amazon.com/redshift/latest/dg/r_userstable.html
Column NameData TypeDescription
USERID INTEGER Primary key, a unique ID value for each row. Each row represents a registered user (a buyer or seller or both) who has listed or bought tickets for at least one event.
USERNAME CHAR(8) An 8-character alphanumeric username, such as PGL08LJI.
FIRSTNAME VARCHAR(30) The user's first name, such as Victor.
LASTNAME VARCHAR(30) The user's last name, such as Hernandez.
CITY VARCHAR(30) The user's home city, such as Naperville.
STATE CHAR(2) The user's home state, such as GA.
EMAIL VARCHAR(100) The user's email address; this column contains random Latin values, such as turpis@accumsanlaoreet.org.
PHONE CHAR(14) The user's 14-character phone number, such as (818) 765-4255.
LIKESPORTS, ... BOOLEAN A series of 10 different columns that identify the user's likes and dislikes with true and false values.

Run below SQL command using 'master' user.
SELECT pg_user.usename, svl_qlog.*
FROM svl_qlog left outer join pg_user on pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100;
Result:
usename userid query xid pid starttime endtime elapsed aborted label substring source_query concurrency_scaling_status_txt from_sp_call
leo                                                                                                                             101 350 2730 1073881432 01:35.4 01:35.4 100 0 default                                                                                                                                                                                                                                                                                                                          select sum(qtysold) from sales, date where sales.dateid = da 333 0 - Ran on the main cluster NULL
matt                                                                                                                            102 333 2655 1073955173 00:18.2 00:18.2 84009 0 default                                                                                                                                                                                                                                                                                                                          select sum(qtysold) from sales, date where sales.dateid = da NULL 0 - Ran on the main cluster NULL

SELECT clause could use tablename.* format to specify all columns under one table

To make the result more concise:
SELECT pg_user.usename, svl_qlog.query, svl_qlog.elapsed, svl_qlog.source_query, svl_qlog.substring
FROM svl_qlog LEFT OUTER JOIN pg_user ON pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100
ORDER BY query DESC;
Result:

usename

query

elapsed

source_query

substring

leo                                                                                                                            

350

100

333

select sum(qtysold) from sales, date where sales.dateid = da

matt                                                                                                                           

333

84009

NULL

select sum(qtysold) from sales, date where sales.dateid = da


EXPLAIN
SELECT pg_user.usename, svl_qlog.query, svl_qlog.elapsed, svl_qlog.source_query, svl_qlog.substring
FROM svl_qlog LEFT OUTER JOIN pg_user ON pg_user.usesysid = svl_qlog.userid
WHERE pg_user.usesysid >= 100
ORDER BY query DESC;
Result:
QUERY PLAN
XN Merge  (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)
  Merge Key: stll_query.query
  ->  XN Network  (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)
        Send to leader
        ->  XN Sort  (cost=1000649840076.10..1000649840076.19 rows=35 width=12156)
              Sort Key: stll_query.query
              ->  XN Hash Join DS_BCAST_INNER  (cost=23.26..649840075.21 rows=35 width=12156)
                    Hash Cond: ("outer".userid = "inner".usesysid)
                    ->  XN Hash Left Join DS_BCAST_INNER  (cost=18.35..505640068.37 rows=105 width=12032)
                          Hash Cond: ("outer".query = "inner".query)
                          ->  XN Hash Left Join DS_BCAST_INNER  (cost=2.98..358040031.00 rows=17 width=12032)
                                Hash Cond: (("outer".query = "inner".query) AND ("outer".userid = "inner".userid) AND ("outer".xid = "inner".xid))
                                ->  XN Hash Right Join DS_DIST_BOTH  (cost=0.00..317240027.75 rows=7 width=12040)
                                      Outer Dist Key: stll_result_cache_history.cache_hit_query
                                      Inner Dist Key: stll_query.query
                                      Hash Cond: ("outer".cache_hit_query = "inner".query)
                                      ->  XN Seq Scan on stll_result_cache_history  (cost=0.00..12.30 rows=1230 width=8)
                                      ->  XN Hash  (cost=0.00..0.00 rows=1 width=12036)
                                            ->  XN Seq Scan on stll_query  (cost=0.00..0.00 rows=1 width=12036)
                                ->  XN Hash  (cost=1.70..1.70 rows=170 width=16)
                                      ->  XN Seq Scan on stll_wlm_query  (cost=0.00..1.70 rows=170 width=16)
                          ->  XN Hash  (cost=12.30..12.30 rows=1230 width=4)
                                ->  XN Seq Scan on stll_stored_proc_call_map  (cost=0.00..12.30 rows=1230 width=4)
                    ->  XN Hash  (cost=4.65..4.65 rows=103 width=132)
                          ->  LD Seq Scan on pg_shadow  (cost=0.00..4.65 rows=103 width=132)
                                Filter: ((usesysid >= 100) AND (usename !~~ 'f346c9b8%'::text))

REVOKE本次实验创建的用户的权限
REVOKE 
ALL 
ON ALL TABLES IN SCHEMA public
FROM leo,matt;

REVOKE 
ALL 
ON TABLE customer, dwdate, lineorder, part, supplier
FROM fred;

关于REVOKE的详细信息,参见:REVOKE
https://docs.aws.amazon.com/redshift/latest/dg/r_REVOKE.html

删除本次实验创建的用户
DROP USER IF EXISTS leo, matt, fred;

关于DROP USER的详细信息,参见:DROP USER
https://docs.aws.amazon.com/redshift/latest/dg/r_DROP_USER.html

查看用户(除rdsdb和master)
SELECT * 
FROM pg_user
WHERE usesysid > 100;


References

Result caching



Category: big_data Tags: public

Upvote


Downvote