Skewed Data
- Oracle은 값에 따른 Data 분포의 편차가 큰 경우 잘 해석하지 못함.
- Skew 문제를 해결할 수 있는 유일한 방법은 Histogram
- 하지만 Histogram으로도 100% 해결할 수 없는 Case가 있음.
Test
- Skewness가 높은 경우 오라클이 Cardinality를 어떻게 계산하는지 알아본다.
- 통계 정보를 생성하되 Histogram은 만들지 않는다.
CREATE TABLE T1(C1 VARCHAR2(1), C2 INT);
CREATE INDEX T1_N1 ON T1(C1);
-- create data
INSERT INTO T1
SELECT 'A',
LEVEL
FROM DUAL
CONNECT BY LEVEL <= 10000
UNION ALL
SELECT 'B', LEVEL
FROM DUAL
CONNECT BY LEVEL <= 10
;
COMMIT;
-- gather stats "without" histogram
EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'T1', METHOD_OPT => 'FOR ALL COLUMNS SIZE 1', NO_INVALIDATE => FALSE);
- 테이블 'T1'에 대한 정보를 다음과 같이 알 수 있다.
-- Table
SELECT TABLE_NAME,
NUM_ROWS,
BLOCKS,
SAMPLE_SIZE,
TO_CHAR(LAST_ANALYZED, 'YYYY/MM/DD HH24:MI:SS') AS LAST_ANAL
FROM USER_TAB_STATISTICS
WHERE TABLE_NAME = UPPER('T1')
;
TABLE_NAME NUM_ROWS BLOCKS SAMPLE_SIZE LAST_ANAL
------------------------------ ---------- ---------- ------------- -----------------
T1 10010 42 10010 2011/05/12 11:53:04
- 테이블 'T1'에 대한 정보도 다음과 같이 알 수 있다.
-- Column
SELECT S.TABLE_NAME,
S.COLUMN_NAME,
S.NUM_DISTINCT,
S.NUM_NULLS,
S.DENSITY,
S.LOW_VALUE,
S.HIGH_VALUE,
S.HISTOGRAM
FROM USER_TAB_COLS S
WHERE S.TABLE_NAME = UPPER('T1')
;
TABLE_NAME COLUMN_NAME NUM_DISTINCT NUM_NULLS DENSITY LOW_VALUE HIGH_VALUE HISTOGRAM
T1 C1 2 0 0.5 41 42 NONE
T1 C2 10000 0 0.0001 C102 C302 NONE
- 테이블 'T1'에 대한 Histogram 정보를 알 수 있다.
-- Histogram
SELECT TABLE_NAME,
COLUMN_NAME,
ENDPOINT_NUMBER,
ENDPOINT_VALUE || '(' || ENDPOINT_ACTUAL_VALUE || ')' AS ENDPOINT_VALUE
FROM USER_TAB_HISTOGRAMS
WHERE TABLE_NAME = UPPER('T1')
ORDER BY COLUMN_NAME,
ENDPOINT_NUMBER
;
TABLE_NAME COLUMN_NAME ENDPOINT_NUMBER ENDPOINT_VALUE
T1 C1 0 337499295804764000000000000000000000()
T1 C1 1 342691592663299000000000000000000000()
T1 C2 0 1()
T1 C2 1 10000()
- 아래의 두 경우를 비교하면 실제의 데이터 Row의 수는 100000건과 10건이지만 E-Rows는 같은 것을 알 수가 있다.
- Density가 1/NDV = 1/2 = 0.5이므로 Cardinality = 10010 * 0.5 = 5005가 된다.
- 이것은 C1 조건 'B'에도 같이 해당된다.
SELECT /*+ GATHER_PLAN_STATISTICS */
COUNT(*)
FROM T1
WHERE C1 = 'A'
;
select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));
Plan hash value: 111057421
PLAN_TABLE_OUTPUT
------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | Cost (%CPU)| A-Rows | A-Time | Buffers |
------------------------------------------------------------------------------------------------------
| 1 | SORT AGGREGATE | | 1 | 1 | | 1 |00:00:00.01 | 49 |
|* 2 | INDEX FAST FULL SCAN| T1_N1 | 1 | 5005 | 8 (0)| 10000 |00:00:00.01 | 49 |
------------------------------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - filter("C1"='A')
SELECT /*+ GATHER_PLAN_STATISTICS */
COUNT(*)
FROM T1
WHERE C1 = 'B'
;
select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));
PLAN_TABLE_OUTPUT
Plan hash value: 111057421
------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | Cost (%CPU)| A-Rows | A-Time | Buffers |
------------------------------------------------------------------------------------------------------
| 1 | SORT AGGREGATE | | 1 | 1 | | 1 |00:00:00.01 | 44 |
|* 2 | INDEX FAST FULL SCAN| T1_N1 | 1 | 5005 | 8 (0)| 10 |00:00:00.01 | 44 |
------------------------------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - filter("C1"='B')
- SKEWONLY 옵션을 이용하여 Histogram을 생성한다.
- 컬럼의 정보가 변한 것을 알 수 있다.
EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'T1', METHOD_OPT => 'FOR ALL COLUMNS SIZE SKEWONLY', NO_INVALIDATE => FALSE);
-- Histogram
SELECT S.TABLE_NAME,
S.COLUMN_NAME,
S.NUM_DISTINCT,
S.NUM_NULLS,
S.DENSITY,
S.LOW_VALUE,
S.HIGH_VALUE,
S.HISTOGRAM
FROM USER_TAB_COLS S
WHERE S.TABLE_NAME = UPPER('T1')
;
TABLE_NAME COLUMN_NAME NUM_DISTINCT NUM_NULLS DENSITY LOW_VALUE HIGH_VALUE HISTOGRAM
T1 C1 2 0 4.995004995005E-5 41 42 FREQUENCY
T1 C2 10000 0 0.0001 C102 C302 NONE
SELECT /*+ GATHER_PLAN_STATISTICS */
COUNT(*)
FROM T1
WHERE C1 = 'A'
;
select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));
PLAN_TABLE_OUTPUT
Plan hash value: 111057421
------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | Cost (%CPU)| A-Rows | A-Time | Buffers |
------------------------------------------------------------------------------------------------------
| 1 | SORT AGGREGATE | | 1 | 1 | | 1 |00:00:00.01 | 44 |
|* 2 | INDEX FAST FULL SCAN| T1_N1 | 1 | 10000 | 8 (0)| 10000 |00:00:00.01 | 44 |
------------------------------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - filter("C1"='A')
SELECT /*+ GATHER_PLAN_STATISTICS */
COUNT(*)
FROM T1
WHERE C1 = 'B'
;
select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));
PLAN_TABLE_OUTPUT
Plan hash value: 3678027643
--------------------------------------------------------------------------------------------------
| Id | Operation | Name | Starts | E-Rows | Cost (%CPU)| A-Rows | A-Time | Buffers |
--------------------------------------------------------------------------------------------------
| 1 | SORT AGGREGATE | | 1 | 1 | | 1 |00:00:00.01 | 2 |
|* 2 | INDEX RANGE SCAN| T1_N1 | 1 | 10 | 1 (0)| 10 |00:00:00.01 | 2 |
--------------------------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - access("C1"='B')