MPP 환경의 hash partition table에서

잘못된 hash key 선정으로 인해 skew가 발생하는 현황을 조회하려면

다음 쿼리로 조회할 수 있다.


with COL as

(
    select
        tabschema
        , tabname
        , colname
        , colno
        , colcard
        , partkeyseq
        , remarks
    from syscat.columns
    where 1=1
    and partkeyseq > 0
)
select TABINF.tabschema, TABINF.tabname
, max(COL.remarks) as remarks
, substr(xmlcast(xmlgroup(', '||COL.colname as colnm order by COL.partkeyseq) as varchar(128)), 2) as "파티션 키 컬럼"
, max(TBL.remarks) as "컬럼 설명"
--, max(COL.partkeyseq) "파티션 키 순번"
, max(TBL.card) as "테이블 건수"
, max(COL.colcard) as "파티션 키 컬럼 distinct 개수"
, stddev((data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)/1024) as "표준편차_MB"
, stddev(data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size) / sum(data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size) * 1024 as "사이즈 고려 표준편차지수"
, sum((data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)/1024) as size_mb
, max(decode(dbpartitionnum, 0, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node0
, max(decode(dbpartitionnum, 1, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node1
, max(decode(dbpartitionnum, 2, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node2
, max(decode(dbpartitionnum, 3, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node3
, max(decode(dbpartitionnum, 4, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node4
, max(decode(dbpartitionnum, 5, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node5
, max(decode(dbpartitionnum, 6, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node6
, max(decode(dbpartitionnum, 7, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node7
, max(decode(dbpartitionnum, 8, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node8
, max(decode(dbpartitionnum, 9, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node9
, max(decode(dbpartitionnum, 10, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node10
, max(decode(dbpartitionnum, 11, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node11
, max(decode(dbpartitionnum, 12, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node12
, max(decode(dbpartitionnum, 13, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node13
, max(decode(dbpartitionnum, 14, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node14
, max(decode(dbpartitionnum, 15, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node15
, max(decode(dbpartitionnum, 16, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node16
, max(decode(dbpartitionnum, 17, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node17
, max(decode(dbpartitionnum, 18, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node18
, max(decode(dbpartitionnum, 19, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node19
, max(decode(dbpartitionnum, 20, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node20
, max(decode(dbpartitionnum, 21, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node21
, max(decode(dbpartitionnum, 22, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node22
, max(decode(dbpartitionnum, 23, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node23
, max(decode(dbpartitionnum, 24, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node24
, max(decode(dbpartitionnum, 25, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node25
, max(decode(dbpartitionnum, 26, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node26
, max(decode(dbpartitionnum, 27, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node27
, max(decode(dbpartitionnum, 28, data_object_p_size+index_object_p_size+long_object_p_size+lob_object_p_size+xml_object_p_size)) as node28
, max(TBL.lastused) as "테이블 마지막 사용일"
from SYSIBMADM.ADMINTABINFO TABINF
right join COL on TABINF.tabschema=COL.tabschema and TABINF.tabname=COL.tabname
right join syscat.tables TBL on TABINF.tabschema=TBL.tabschema and TABINF.tabname=TBL.tabname
where TABINF.tabtype='T'
group by TABINF.tabschema, TABINF.tabname
order by TABINF.tabschema, TABINF.tabname
;


hash key를 여러 칼럼으로 구성한 경우에 대해서는 결과가 제대로 안나올 수 있다.

수행 결과는 다음과 같으며


노드 간 데이터 분산 표준편차는 노드별로 고르지 않은 데이터 분포를 나타내는 값으로 크면 클 수록 노드별 분산이 잘 안되어있다는 의미이고


skew로 인해 발생하는 수행 속도 이슈나 용량 이슈는 테이블이 클 수록 영향이 크므로,

테이블의 용량 가중치를 준 '사이즈 대비 표준편차' 를 기준으로 hash key 재지정 대상을 선정한 후

업무적으로 변경이 가능한 지를 판단하면 되겠다.

Posted by in0de
,