Skewed Data

  • Oracle은 값에 따른 Data 분포의 편차가 큰 경우 잘 해석하지 못함.
  • Skew 문제를 해결할 수 있는 유일한 방법은 Histogram
  • 하지만 Histogram으로도 100% 해결할 수 없는 Case가 있음.

Test

  • Skewness가 높은 경우 오라클이 Cardinality를 어떻게 계산하는지 알아본다.
  • 통계 정보를 생성하되 Histogram은 만들지 않는다.

CREATE TABLE T1(C1 VARCHAR2(1), C2 INT);

CREATE INDEX T1_N1 ON T1(C1);

-- create data
INSERT INTO T1
SELECT 'A',
       LEVEL
FROM   DUAL
CONNECT BY LEVEL <= 10000
UNION ALL
SELECT 'B', LEVEL
FROM   DUAL
CONNECT BY LEVEL <= 10
;

COMMIT;

-- gather stats "without" histogram
EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'T1', METHOD_OPT => 'FOR ALL COLUMNS SIZE 1', NO_INVALIDATE => FALSE);

  • 테이블 'T1'에 대한 정보를 다음과 같이 알 수 있다.

-- Table
SELECT TABLE_NAME,
       NUM_ROWS,
       BLOCKS,
       SAMPLE_SIZE,
       TO_CHAR(LAST_ANALYZED, 'YYYY/MM/DD HH24:MI:SS') AS LAST_ANAL
FROM   USER_TAB_STATISTICS
WHERE  TABLE_NAME = UPPER('T1')
;

TABLE_NAME                       NUM_ROWS     BLOCKS SAMPLE_SIZE   LAST_ANAL
------------------------------ ---------- ---------- ------------- -----------------
T1                                  10010         42       10010   2011/05/12 11:53:04


  • 테이블 'T1'에 대한 정보도 다음과 같이 알 수 있다.

-- Column
SELECT S.TABLE_NAME,
       S.COLUMN_NAME,
       S.NUM_DISTINCT,
       S.NUM_NULLS,
       S.DENSITY,
       S.LOW_VALUE,
       S.HIGH_VALUE,
       S.HISTOGRAM
FROM   USER_TAB_COLS S
WHERE  S.TABLE_NAME = UPPER('T1')
;

TABLE_NAME	COLUMN_NAME	NUM_DISTINCT	NUM_NULLS	DENSITY	LOW_VALUE	HIGH_VALUE HISTOGRAM
	
T1	        C1	        2	        0	        0.5	41	        42	   NONE
T1	        C2	        10000	        0	        0.0001	C102	        C302	   NONE


  • 테이블 'T1'에 대한 Histogram 정보를 알 수 있다.

-- Histogram
SELECT TABLE_NAME,
       COLUMN_NAME,
       ENDPOINT_NUMBER,
       ENDPOINT_VALUE || '(' || ENDPOINT_ACTUAL_VALUE || ')' AS ENDPOINT_VALUE
FROM   USER_TAB_HISTOGRAMS
WHERE  TABLE_NAME = UPPER('T1')
ORDER  BY COLUMN_NAME,
          ENDPOINT_NUMBER
;

TABLE_NAME	COLUMN_NAME	ENDPOINT_NUMBER	ENDPOINT_VALUE

T1	        C1	        0	        337499295804764000000000000000000000()
T1	        C1	        1	        342691592663299000000000000000000000()
T1	        C2	        0	        1()
T1	        C2	        1	        10000()
         

  • 아래의 두 경우를 비교하면 실제의 데이터 Row의 수는 100000건과 10건이지만 E-Rows는 같은 것을 알 수가 있다.
  • Density가 1/NDV = 1/2 = 0.5이므로 Cardinality = 10010 * 0.5 = 5005가 된다.
  • 이것은 C1 조건 'B'에도 같이 해당된다.

SELECT /*+ GATHER_PLAN_STATISTICS */
       COUNT(*)
FROM   T1
WHERE  C1 = 'A'
;

select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));

Plan hash value: 111057421

PLAN_TABLE_OUTPUT

------------------------------------------------------------------------------------------------------
| Id  | Operation             | Name  | Starts | E-Rows | Cost (%CPU)| A-Rows |   A-Time   | Buffers |
------------------------------------------------------------------------------------------------------
|   1 |  SORT AGGREGATE       |       |      1 |      1 |            |      1 |00:00:00.01 |      49 |
|*  2 |   INDEX FAST FULL SCAN| T1_N1 |      1 |   5005 |     8   (0)|  10000 |00:00:00.01 |      49 |
------------------------------------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   2 - filter("C1"='A')




SELECT /*+ GATHER_PLAN_STATISTICS */
       COUNT(*)
FROM   T1
WHERE  C1 = 'B'
;

select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));
PLAN_TABLE_OUTPUT

Plan hash value: 111057421
 
------------------------------------------------------------------------------------------------------
| Id  | Operation             | Name  | Starts | E-Rows | Cost (%CPU)| A-Rows |   A-Time   | Buffers |
------------------------------------------------------------------------------------------------------
|   1 |  SORT AGGREGATE       |       |      1 |      1 |            |      1 |00:00:00.01 |      44 |
|*  2 |   INDEX FAST FULL SCAN| T1_N1 |      1 |   5005 |     8   (0)|     10 |00:00:00.01 |      44 |
------------------------------------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   2 - filter("C1"='B')
 


  • SKEWONLY 옵션을 이용하여 Histogram을 생성한다.
  • 컬럼의 정보가 변한 것을 알 수 있다.

EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'T1', METHOD_OPT => 'FOR ALL COLUMNS SIZE SKEWONLY', NO_INVALIDATE => FALSE);

-- Histogram
SELECT S.TABLE_NAME,
       S.COLUMN_NAME,
       S.NUM_DISTINCT,
       S.NUM_NULLS,
       S.DENSITY,
       S.LOW_VALUE,
       S.HIGH_VALUE,
       S.HISTOGRAM
FROM   USER_TAB_COLS S
WHERE  S.TABLE_NAME = UPPER('T1')
;

TABLE_NAME COLUMN_NAME	NUM_DISTINCT NUM_NULLS	DENSITY	           LOW_VALUE HIGH_VALUE  HISTOGRAM
T1	   C1	        2	     0	        4.995004995005E-5	   41	     42	        FREQUENCY
T1	   C2	        10000	     0	        0.0001	           C102	     C302	NONE


  • 정확한 Cardinality를 알 수 있다.

SELECT /*+ GATHER_PLAN_STATISTICS */
       COUNT(*)
FROM   T1
WHERE  C1 = 'A'
;

select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));

PLAN_TABLE_OUTPUT

Plan hash value: 111057421
 
------------------------------------------------------------------------------------------------------
| Id  | Operation             | Name  | Starts | E-Rows | Cost (%CPU)| A-Rows |   A-Time   | Buffers |
------------------------------------------------------------------------------------------------------
|   1 |  SORT AGGREGATE       |       |      1 |      1 |            |      1 |00:00:00.01 |      44 |
|*  2 |   INDEX FAST FULL SCAN| T1_N1 |      1 |  10000 |     8   (0)|  10000 |00:00:00.01 |      44 |
------------------------------------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   2 - filter("C1"='A')
 



SELECT /*+ GATHER_PLAN_STATISTICS */
       COUNT(*)
FROM   T1
WHERE  C1 = 'B'
;

select * from table(dbms_xplan.display_cursor(null, null, 'iostats cost last'));

PLAN_TABLE_OUTPUT

Plan hash value: 3678027643
 
--------------------------------------------------------------------------------------------------
| Id  | Operation         | Name  | Starts | E-Rows | Cost (%CPU)| A-Rows |   A-Time   | Buffers |
--------------------------------------------------------------------------------------------------
|   1 |  SORT AGGREGATE   |       |      1 |      1 |            |      1 |00:00:00.01 |       2 |
|*  2 |   INDEX RANGE SCAN| T1_N1 |      1 |     10 |     1   (0)|     10 |00:00:00.01 |       2 |
--------------------------------------------------------------------------------------------------
 
Predicate Information (identified by operation id):
---------------------------------------------------
 
   2 - access("C1"='B')