#!/bin/bash ################################ # PARAMETERS FOR DATA GENERATOR ################################ RUNS=10 # table should be 1GB NUM_PAGES=131072 # table with a single int column ROWS_PER_RANGE=226 # what number of pages should contain the MATCHING_PAGES="1 1000 10000" # result name RESULT=$1 ############################# # PARAMETERS FOR INDEX BUILD ############################# # BRIN range size PAGES_PER_RANGE="1 4 8 16 32 64 128 256" # bloom false positive FALSE_POSITIVES="0.01 0.025 0.05 0.075 0.1 0.15" FILL_FACTORS="100 50 150" DATE=$(date +%Y%m%d-%H%M%S) echo "run type match table_pages rows_per_range matching_pages total_rows modulo_interval pages_per_range fill_factor ndistinct ndistinct_target false_positive_rate index_pages run seed value time removed rows" > results-$RESULT-$DATE.csv psql test -c "drop table if exists t" psql test -c "create table t (a int)" RUN=0 SEED=0 for mp in $MATCHING_PAGES; do psql test -c "truncate t" psql test -c "drop index if exists idx" # total number of rows in the table TOTAL_ROWS=$((NUM_PAGES * ROWS_PER_RANGE)) # how often we need to repeat the values, to get the desired number # of matching pages MODULO_INTERVAL=$((TOTAL_ROWS / mp)) # generate data psql test -c "insert into t select mod(i, $MODULO_INTERVAL) from generate_series(1, $TOTAL_ROWS) s(i)" for ppr in $PAGES_PER_RANGE; do NDISTINCT=$((ppr * ROWS_PER_RANGE)) for ff in $FILL_FACTORS; do # used to size the bloom filter (too small / too large) NDISTINCT_TARGET=$((NDISTINCT * 100 / ff)) for fpr in $FALSE_POSITIVES; do if [ -f "stop" ]; then rm "stop" exit fi SEED=$((SEED+1000)) psql test -c "drop index if exists idx" psql test -c "create index idx on t using brin (a int4_bloom_ops(n_distinct_per_range = $NDISTINCT_TARGET, false_positive_rate = $fpr, sort_mode = false)) with (pages_per_range = $ppr)" psql test -c "vacuum analyze t" idx_pages=`psql test -t -A -c "select relpages from pg_class where relname = 'idx'"` # index not built (filter too large) if [ "$idx_pages" == "" ]; then continue fi # matches for i in `seq 1 $RUNS`; do RUN=$((RUN+1)) v=`./rnd.py $((SEED+i)) 0 $((MODULO_INTERVAL-1))` psql test > plan.log <> plans-$DATE.log cat plan.log >> plans-$DATE.log time=`cat plan.log | grep '^Time' | awk '{print $2}'` removed=`cat plan.log | grep 'Rows Removed by Index Recheck' | awk '{print $6}'` if [ "$removed" == "" ]; then removed="0" fi rows=`psql test -t -A -c "select count(*) from t where a = $v"` echo "$RUN int4 match $NUM_PAGES $ROWS_PER_RANGE $mp $TOTAL_ROWS $MODULO_INTERVAL $ppr $ff $NDISTINCT $NDISTINCT_TARGET $fpr $idx_pages $i $SEED $v $time $removed $rows" >> results-$RESULT-$DATE.csv done # no matches (outside modulo interval) for i in `seq 1 $RUNS`; do RUN=$((RUN+1)) v=`./rnd.py $((SEED+i)) $MODULO_INTERVAL $((MODULO_INTERVAL*2))` psql test > plan.log <> plans-$DATE.log cat plan.log >> plans-$DATE.log time=`cat plan.log | grep '^Time' | awk '{print $2}'` removed=`cat plan.log | grep 'Rows Removed by Index Recheck' | awk '{print $6}'` if [ "$removed" == "" ]; then removed="0" fi rows=`psql test -t -A -c "select count(*) from t where a = $v"` echo "$RUN int4 mismatch $NUM_PAGES $ROWS_PER_RANGE $mp $TOTAL_ROWS $MODULO_INTERVAL $ppr $ff $NDISTINCT $NDISTINCT_TARGET $fpr $idx_pages $i $SEED $v $time $removed $rows" >> results-$RESULT-$DATE.csv done done done done done # md5 hashes (32B each, no toasting) ROWS_PER_RANGE=120 psql test -c "drop table if exists t" psql test -c "create table t (a text)" for mp in $MATCHING_PAGES; do psql test -c "truncate t" psql test -c "drop index if exists idx" # total number of rows in the table TOTAL_ROWS=$((NUM_PAGES * ROWS_PER_RANGE)) # how often we need to repeat the values, to get the desired number # of matching pages MODULO_INTERVAL=$((TOTAL_ROWS / mp)) # generate data psql test -c "insert into t select md5(mod(i, $MODULO_INTERVAL)::text) from generate_series(1, $TOTAL_ROWS) s(i)" for ppr in $PAGES_PER_RANGE; do NDISTINCT=$((ppr * ROWS_PER_RANGE)) for ff in $FILL_FACTORS; do # used to size the bloom filter (too small / too large) NDISTINCT_TARGET=$((NDISTINCT * 100 / ff)) for fpr in $FALSE_POSITIVES; do if [ -f "stop" ]; then rm "stop" exit fi SEED=$((SEED+1000)) psql test -c "drop index if exists idx" psql test -c "create index idx on t using brin (a text_bloom_ops(n_distinct_per_range = $NDISTINCT_TARGET, false_positive_rate = $fpr, sort_mode = false)) with (pages_per_range = $ppr)" psql test -c "vacuum analyze t" idx_pages=`psql test -t -A -c "select relpages from pg_class where relname = 'idx'"` # index not built (filter too large) if [ "$idx_pages" == "" ]; then continue fi # matches for i in `seq 1 $RUNS`; do RUN=$((RUN+1)) v=`./rnd.py $((SEED+i)) 0 $((MODULO_INTERVAL-1))` psql test > plan.log <> plans-$DATE.log cat plan.log >> plans-$DATE.log time=`cat plan.log | grep '^Time' | awk '{print $2}'` removed=`cat plan.log | grep 'Rows Removed by Index Recheck' | awk '{print $6}'` if [ "$removed" == "" ]; then removed="0" fi rows=`psql test -t -A -c "select count(*) from t where a = md5($v::text)"` echo "$RUN text match $NUM_PAGES $ROWS_PER_RANGE $mp $TOTAL_ROWS $MODULO_INTERVAL $ppr $ff $NDISTINCT $NDISTINCT_TARGET $fpr $idx_pages $i $SEED $v $time $removed $rows" >> results-$RESULT-$DATE.csv done # no matches (outside modulo interval) for i in `seq 1 $RUNS`; do RUN=$((RUN+1)) v=`./rnd.py $((SEED+i)) $MODULO_INTERVAL $((MODULO_INTERVAL*2))` psql test > plan.log <> plans-$DATE.log cat plan.log >> plans-$DATE.log time=`cat plan.log | grep '^Time' | awk '{print $2}'` removed=`cat plan.log | grep 'Rows Removed by Index Recheck' | awk '{print $6}'` if [ "$removed" == "" ]; then removed="0" fi rows=`psql test -t -A -c "select count(*) from t where a = md5($v::text)"` echo "$RUN text mismatch $NUM_PAGES $ROWS_PER_RANGE $mp $TOTAL_ROWS $MODULO_INTERVAL $ppr $ff $NDISTINCT $NDISTINCT_TARGET $fpr $idx_pages $i $SEED $v $time $removed $rows" >> results-$RESULT-$DATE.csv done done done done done