|
1 | 1 | --
|
2 | 2 | -- RANDOM
|
3 |
| --- Test the random function |
| 3 | +-- Test random() and allies |
4 | 4 | --
|
5 |
| --- count the number of tuples originally, should be 1000 |
6 |
| -SELECT count(*) FROM onek; |
7 |
| - count |
8 |
| -------- |
9 |
| - 1000 |
10 |
| -(1 row) |
11 |
| - |
12 |
| --- pick three random rows, they shouldn't match |
13 |
| -(SELECT unique1 AS random |
14 |
| - FROM onek ORDER BY random() LIMIT 1) |
15 |
| -INTERSECT |
16 |
| -(SELECT unique1 AS random |
17 |
| - FROM onek ORDER BY random() LIMIT 1) |
18 |
| -INTERSECT |
19 |
| -(SELECT unique1 AS random |
20 |
| - FROM onek ORDER BY random() LIMIT 1); |
21 |
| - random |
22 |
| --------- |
| 5 | +-- Tests in this file may have a small probability of failure, |
| 6 | +-- since we are dealing with randomness. Try to keep the failure |
| 7 | +-- risk for any one test case under 1e-9. |
| 8 | +-- |
| 9 | +-- There should be no duplicates in 1000 random() values. |
| 10 | +-- (Assuming 52 random bits in the float8 results, we could |
| 11 | +-- take as many as 3000 values and still have less than 1e-9 chance |
| 12 | +-- of failure, per https://en.wikipedia.org/wiki/Birthday_problem) |
| 13 | +SELECT r, count(*) |
| 14 | +FROM (SELECT random() r FROM generate_series(1, 1000)) ss |
| 15 | +GROUP BY r HAVING count(*) > 1; |
| 16 | + r | count |
| 17 | +---+------- |
23 | 18 | (0 rows)
|
24 | 19 |
|
25 |
| --- count roughly 1/10 of the tuples |
26 |
| -CREATE TABLE RANDOM_TBL AS |
27 |
| - SELECT count(*) AS random |
28 |
| - FROM onek WHERE random() < 1.0/10; |
29 |
| --- select again, the count should be different |
30 |
| -INSERT INTO RANDOM_TBL (random) |
31 |
| - SELECT count(*) |
32 |
| - FROM onek WHERE random() < 1.0/10; |
33 |
| --- select again, the count should be different |
34 |
| -INSERT INTO RANDOM_TBL (random) |
35 |
| - SELECT count(*) |
36 |
| - FROM onek WHERE random() < 1.0/10; |
37 |
| --- select again, the count should be different |
38 |
| -INSERT INTO RANDOM_TBL (random) |
39 |
| - SELECT count(*) |
40 |
| - FROM onek WHERE random() < 1.0/10; |
41 |
| --- now test that they are different counts |
42 |
| -SELECT random, count(random) FROM RANDOM_TBL |
43 |
| - GROUP BY random HAVING count(random) > 3; |
44 |
| - random | count |
45 |
| ---------+------- |
46 |
| -(0 rows) |
| 20 | +-- The range should be [0, 1). We can expect that at least one out of 2000 |
| 21 | +-- random values is in the lowest or highest 1% of the range with failure |
| 22 | +-- probability less than about 1e-9. |
| 23 | +SELECT count(*) FILTER (WHERE r < 0 OR r >= 1) AS out_of_range, |
| 24 | + (count(*) FILTER (WHERE r < 0.01)) > 0 AS has_small, |
| 25 | + (count(*) FILTER (WHERE r > 0.99)) > 0 AS has_large |
| 26 | +FROM (SELECT random() r FROM generate_series(1, 2000)) ss; |
| 27 | + out_of_range | has_small | has_large |
| 28 | +--------------+-----------+----------- |
| 29 | + 0 | t | t |
| 30 | +(1 row) |
47 | 31 |
|
48 |
| -SELECT AVG(random) FROM RANDOM_TBL |
49 |
| - HAVING AVG(random) NOT BETWEEN 80 AND 120; |
50 |
| - avg |
51 |
| ------ |
52 |
| -(0 rows) |
| 32 | +-- Check for uniform distribution using the Kolmogorov-Smirnov test. |
| 33 | +CREATE FUNCTION ks_test_uniform_random() |
| 34 | +RETURNS boolean AS |
| 35 | +$$ |
| 36 | +DECLARE |
| 37 | + n int := 1000; -- Number of samples |
| 38 | + c float8 := 1.94947; -- Critical value for 99.9% confidence |
| 39 | + ok boolean; |
| 40 | +BEGIN |
| 41 | + ok := ( |
| 42 | + WITH samples AS ( |
| 43 | + SELECT random() r FROM generate_series(1, n) ORDER BY 1 |
| 44 | + ), indexed_samples AS ( |
| 45 | + SELECT (row_number() OVER())-1.0 i, r FROM samples |
| 46 | + ) |
| 47 | + SELECT max(abs(i/n-r)) < c / sqrt(n) FROM indexed_samples |
| 48 | + ); |
| 49 | + RETURN ok; |
| 50 | +END |
| 51 | +$$ |
| 52 | +LANGUAGE plpgsql; |
| 53 | +-- As written, ks_test_uniform_random() returns true about 99.9% |
| 54 | +-- of the time. To get down to a roughly 1e-9 test failure rate, |
| 55 | +-- just run it 3 times and accept if any one of them passes. |
| 56 | +SELECT ks_test_uniform_random() OR |
| 57 | + ks_test_uniform_random() OR |
| 58 | + ks_test_uniform_random() AS uniform; |
| 59 | + uniform |
| 60 | +--------- |
| 61 | + t |
| 62 | +(1 row) |
53 | 63 |
|
54 | 64 | -- now test random_normal()
|
55 |
| -TRUNCATE random_tbl; |
56 |
| -INSERT INTO random_tbl (random) |
57 |
| - SELECT count(*) |
58 |
| - FROM onek WHERE random_normal(0, 1) < 0; |
59 |
| -INSERT INTO random_tbl (random) |
60 |
| - SELECT count(*) |
61 |
| - FROM onek WHERE random_normal(0) < 0; |
62 |
| -INSERT INTO random_tbl (random) |
63 |
| - SELECT count(*) |
64 |
| - FROM onek WHERE random_normal() < 0; |
65 |
| -INSERT INTO random_tbl (random) |
66 |
| - SELECT count(*) |
67 |
| - FROM onek WHERE random_normal(stddev => 1, mean => 0) < 0; |
68 |
| --- expect similar, but not identical values |
69 |
| -SELECT random, count(random) FROM random_tbl |
70 |
| - GROUP BY random HAVING count(random) > 3; |
71 |
| - random | count |
72 |
| ---------+------- |
| 65 | +-- As above, there should be no duplicates in 1000 random_normal() values. |
| 66 | +SELECT r, count(*) |
| 67 | +FROM (SELECT random_normal() r FROM generate_series(1, 1000)) ss |
| 68 | +GROUP BY r HAVING count(*) > 1; |
| 69 | + r | count |
| 70 | +---+------- |
73 | 71 | (0 rows)
|
74 | 72 |
|
75 |
| --- approximately check expected distribution |
76 |
| -SELECT AVG(random) FROM random_tbl |
77 |
| - HAVING AVG(random) NOT BETWEEN 400 AND 600; |
78 |
| - avg |
79 |
| ------ |
80 |
| -(0 rows) |
| 73 | +-- ... unless we force the range (standard deviation) to zero. |
| 74 | +-- This is a good place to check that the mean input does something, too. |
| 75 | +SELECT r, count(*) |
| 76 | +FROM (SELECT random_normal(10, 0) r FROM generate_series(1, 100)) ss |
| 77 | +GROUP BY r; |
| 78 | + r | count |
| 79 | +----+------- |
| 80 | + 10 | 100 |
| 81 | +(1 row) |
| 82 | + |
| 83 | +SELECT r, count(*) |
| 84 | +FROM (SELECT random_normal(-10, 0) r FROM generate_series(1, 100)) ss |
| 85 | +GROUP BY r; |
| 86 | + r | count |
| 87 | +-----+------- |
| 88 | + -10 | 100 |
| 89 | +(1 row) |
| 90 | + |
| 91 | +-- setseed() should produce a reproducible series of random() values. |
| 92 | +SELECT setseed(0.5); |
| 93 | + setseed |
| 94 | +--------- |
| 95 | + |
| 96 | +(1 row) |
| 97 | + |
| 98 | +SELECT random() FROM generate_series(1, 10); |
| 99 | + random |
| 100 | +--------------------- |
| 101 | + 0.9851677175347999 |
| 102 | + 0.825301858027981 |
| 103 | + 0.12974610012450416 |
| 104 | + 0.16356291958601088 |
| 105 | + 0.6476186144084 |
| 106 | + 0.8822771983038762 |
| 107 | + 0.1404566845227775 |
| 108 | + 0.15619865764623442 |
| 109 | + 0.5145227426983392 |
| 110 | + 0.7712969548127826 |
| 111 | +(10 rows) |
| 112 | + |
| 113 | +-- Likewise for random_normal(); however, since its implementation relies |
| 114 | +-- on libm functions that have different roundoff behaviors on different |
| 115 | +-- machines, we have to round off the results a bit to get consistent output. |
| 116 | +SET extra_float_digits = 0; |
| 117 | +SELECT random_normal() FROM generate_series(1, 10); |
| 118 | + random_normal |
| 119 | +-------------------- |
| 120 | + 0.208534644938377 |
| 121 | + 0.264530240540963 |
| 122 | + -0.606752467900428 |
| 123 | + 0.825799427852654 |
| 124 | + 1.70111611735357 |
| 125 | + -0.223445463716189 |
| 126 | + 0.249712419190998 |
| 127 | + -1.2494722990669 |
| 128 | + 0.125627152043677 |
| 129 | + 0.475391614544013 |
| 130 | +(10 rows) |
| 131 | + |
| 132 | +SELECT random_normal(mean => 1, stddev => 0.1) r FROM generate_series(1, 10); |
| 133 | + r |
| 134 | +------------------- |
| 135 | + 1.00605972811732 |
| 136 | + 1.09685453015002 |
| 137 | + 1.02869206132007 |
| 138 | + 0.909475676712336 |
| 139 | + 0.983724763134265 |
| 140 | + 0.939344549577623 |
| 141 | + 1.18713500206363 |
| 142 | + 0.962257684292933 |
| 143 | + 0.914441206800407 |
| 144 | + 0.964031055575433 |
| 145 | +(10 rows) |
81 | 146 |
|
0 commit comments