OracleのSAMPLE句によるランダムセレクションのばらつきを調べる

SQLアンチパターンを読んでいるのだけど、第15章　ランダムセレクションでは文字通りランダム抽出についての章がある。その章で、ベンダー依存の解決策という断り書きがされた上で、OracleのSAMPLE句を使用したやり方が紹介されている。

Oracleでは（中略）SAMPLE句が使用できます。例えば、テーブル全行の1パーセントを返すクエリを書けます。
SELECT * FROM (SELECT * FROM Bugs SAMPLE (1)
ORDER BY dbms_random.value) WHERE ROWNUM = 1;
SQLアンチパターン 15章ランダムセレクション 15.5.5 ベンダー依存の解決策　より抜粋

このエントリではSAMPLE句を試し、どのような動きをするかを確かめる。

環境

準備

下記のテーブルにデータを入れる。

drop table hoge purge;
create table hoge 
(
  hoge_id integer not null,
  hoge_value varchar2(16),
  constraint hoge_pk primary key (hoge_id)
);

データを3万件入れる。主キーは1オリジンの連番。

insert into hoge
select 
  rownum,
  dbms_random.string('X', 16)
from
  (select rownum from all_catalog where rownum <= 1000),
  (select rownum from all_catalog where rownum <= 30);
commit;

ランダムセレクションは下記のクエリを実行する。SAMPLE句の0.005てのは、30000の0.005パーセントで1.5件になる。実際には0〜4件くらいが返ってくるのでrownumで1件に絞ってしまっている。後述するけど、0件になったときはリトライすることにした。

select hoge_id, hoge_value from hoge sample(0.005) where rownum <= 1;

計測用のプログラム。合計1000万回ランダムセレクションのクエリを発行して、1〜30000の範囲の主キー値が何回出現するかの回数を数える。あと、100万回刻みのインターバルでも結果を取得する……のだが、余り意味が無かった点については後述。

package oraclesample;

import java.io.IOException;
import java.io.PrintStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;

public class Main {

    public static void main(String[] args) throws IOException {
        int[] map = new int[30001];
        Arrays.fill(map, 0);
        
        final String sqlStr = "select hoge_id, hoge_value from hoge sample(0.005) where rownum <= 1";
        try (Connection connection = DriverManager.getConnection(
                "jdbc:oracle:thin:@192.168.0.20:1521:XE", "kagamihoge", "xxxx");
                PreparedStatement sql = connection.prepareStatement(sqlStr);) {
            
            final int MAX      = 10_000_000;
            final int INTERVAL =  1_000_000;
            for (int i=0; i<MAX; i++) {
                map[getSampleId(sql)]++;
                
                if (i % INTERVAL == 0) {
                    writeToFile(map, i);
                }
            }
            writeToFile(map, MAX);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
    
    private static int getSampleId(PreparedStatement sql) throws SQLException {
        boolean isRetry = true;
        int hoge_id = -1;
        
        while (isRetry) {
            try (ResultSet r = sql.executeQuery();) {
                if (!r.next()) {
                    continue;
                }
                hoge_id = r.getInt(1);
                break;
            }
        }
        return hoge_id;
    }

    public static void writeToFile(int[] map, int count) throws IOException {
        System.setOut(new PrintStream(Files.newOutputStream(Paths.get("hoge"+count+".txt"))));
        for (int i=1; i<map.length; i++) {
            System.out.println(map[i]);
        }
    }
}