如何快速估算pg表行数-CFANZ编程社区

一、统计信息 pg_class.reltuples

最简便的方法是利用pg_class.reltuples，类似oracle的num_rows

postgres=# select reltuples::numeric from pg_class where relname='pgbench_accounts';
  reltuples 
 -----------
   20000000
 (1 row)

reltuples 字段的类型是 real（float4），数值较大时 psql 会以科学计数法显示；加上 ::numeric 转换后即可显示为普通数字

postgres=# select reltuples from pg_class where relname='pgbench_accounts';
  reltuples 
 -----------
      2e+07
 (1 row)

但是这个字段的值需要收集统计信息后才有，如果统计信息过旧，也会不准确

create table tmp001(aid integer) WITH (autovacuum_enabled = off);
 insert into tmp001 select aid from pgbench_accounts;
 select reltuples::numeric from pg_class where relname='tmp001';
  reltuples 
 -----------
          0
 (1 row)

二、执行计划 rows

如果没有统计信息或者比较旧了，又不想收集，可以使用explain

1. 用法测试

postgres=# EXPLAIN SELECT 1 FROM tmp001 limit 1;
                                QUERY PLAN                               
 ------------------------------------------------------------------------
  Limit  (cost=0.00..0.01 rows=1 width=4)
    ->  Seq Scan on tmp001  (cost=0.00..314160.80 rows=22566480 width=4)
 (2 rows)

可以看到，在完全没有统计信息的情况下，估算值为22566480，与实际的20000000行相比偏差约13%

收集之后，偏差明显减少

postgres=# analyze tmp001;
 ANALYZE

postgres=# EXPLAIN SELECT 1 FROM tmp001 limit 1;
                                QUERY PLAN                               
 ------------------------------------------------------------------------
  Limit  (cost=0.00..0.01 rows=1 width=4)
    ->  Seq Scan on tmp001  (cost=0.00..288496.96 rows=20000096 width=4)
 (2 rows)

但是注意不要用EXPLAIN SELECT count(*)来估算，结果相差很大：顶层聚合节点的rows恒为1，而并行扫描节点的rows（下例中的8333373）只是单个worker的预估行数，并不是全表行数

postgres=# EXPLAIN SELECT count(*) FROM tmp001;
                                          QUERY PLAN                                         
 --------------------------------------------------------------------------------------------
  Finalize Aggregate  (cost=193663.38..193663.39 rows=1 width=8)
    ->  Gather  (cost=193663.17..193663.38 rows=2 width=8)
          Workers Planned: 2
          ->  Partial Aggregate  (cost=192663.17..192663.18 rows=1 width=8)
                ->  Parallel Seq Scan on tmp001  (cost=0.00..171829.73 rows=8333373 width=0)
 (5 rows)

为了方便获取预估值，可以将执行计划输出转为json格式

postgres=# EXPLAIN (FORMAT JSON) SELECT 1 FROM tmp001 limit 1;
                 QUERY PLAN                 
 -------------------------------------------
  [                                        +
    {                                      +
      "Plan": {                            +
        "Node Type": "Limit",              +
        "Parallel Aware": false,           +
        "Startup Cost": 0.00,              +
        "Total Cost": 0.01,                +
        "Plan Rows": 1,                    +
        "Plan Width": 4,                   +
        "Plans": [                         +
          {                                +
            "Node Type": "Seq Scan",       +
            "Parent Relationship": "Outer",+
            "Parallel Aware": false,       +
            "Relation Name": "tmp001",     +
            "Alias": "tmp001",             +
            "Startup Cost": 0.00,          +
            "Total Cost": 288496.96,       +
            "Plan Rows": 20000096,         +
            "Plan Width": 4                +
          }                                +
        ]                                  +
      }                                    +
    }                                      +
  ]
 (1 row)

2. 按表名统计

创建函数，将Plan Rows转换成输出：

-- countit(schema, table): planner-based row estimate for one relation.
--
-- Builds "EXPLAIN (FORMAT JSON) SELECT 1 FROM <schema>.<table>", runs it
-- dynamically, and returns the "Plan Rows" value of the top-level plan node.
-- Plain EXPLAIN (no ANALYZE) only plans the statement, so the relation is
-- never actually scanned.
--
-- Parameters:
--   $1  schema name  (quoted as an identifier via format's %I)
--   $2  relation name (quoted as an identifier via format's %I)
-- Returns:
--   float4 estimate. The JSON value is extracted as text and coerced to the
--   declared return type, so very large estimates are subject to float4
--   rounding.
CREATE OR REPLACE FUNCTION countit(name,name)
RETURNS float4
LANGUAGE plpgsql
AS $func$
DECLARE
    target_schema ALIAS FOR $1;
    target_table  ALIAS FOR $2;
    explain_stmt  text;
    plan_doc      json;
    top_rows      text;
BEGIN
    -- %I quotes each part as an SQL identifier, so mixed-case or otherwise
    -- unusual names are referenced safely.
    explain_stmt := format('EXPLAIN (FORMAT JSON) SELECT 1 FROM %I.%I', target_schema, target_table);

    EXECUTE explain_stmt INTO plan_doc;

    -- The JSON output is an array with one element; its "Plan" object is
    -- the top-level plan node.
    top_rows := plan_doc #>> '{0,Plan,"Plan Rows"}';

    RETURN top_rows;
END;
$func$;

postgres=# select countit('public','tmp001')::numeric;
  countit  
 ----------
  20011000
 (1 row)

3. 查询所有表

-- Approximate row count for every ordinary table ('r') and view ('v').
-- Tables use the stored statistic pg_class.reltuples; views have no such
-- statistic, so their estimate comes from the planner via countit().
-- Largest estimates are listed first.
SELECT
    c.relname AS "table",
    CASE c.relkind
        WHEN 'r' THEN c.reltuples::numeric
        ELSE countit(n.nspname, c.relname)::numeric
    END AS approximate_count
FROM pg_catalog.pg_namespace AS n
INNER JOIN pg_catalog.pg_class AS c
    ON c.relnamespace = n.oid
WHERE c.relkind IN ('r', 'v')
ORDER BY approximate_count DESC;
     
     
                      table             | approximate_count 
 ---------------------------------------+-------------------
  tmp001                      |          20000000
  test                                  |           1608000
  pgbench_branches                      |              5718

4. 按SQL语句统计

-- countit(query): planner-based row estimate for an arbitrary SQL statement.
--
-- Prefixes the given statement with "EXPLAIN (FORMAT JSON) ", runs the
-- result dynamically, and returns the "Plan Rows" value of the top-level
-- plan node. Plain EXPLAIN (no ANALYZE) only plans the statement.
--
-- Parameters:
--   $1  the SQL statement text to estimate
-- Returns:
--   float4 estimate. The JSON value is extracted as text and coerced to the
--   declared return type, so very large estimates are subject to float4
--   rounding.
--
-- NOTE(review): the statement text is concatenated verbatim into the dynamic
-- command, so this function must only be given trusted SQL.
CREATE OR REPLACE FUNCTION countit(text)
RETURNS float4
LANGUAGE plpgsql
AS $func$
DECLARE
    query_text   ALIAS FOR $1;
    explain_stmt text;
    plan_doc     json;
    top_rows     text;
BEGIN
    explain_stmt := 'EXPLAIN (FORMAT JSON) ' || query_text;

    EXECUTE explain_stmt INTO plan_doc;

    -- The JSON output is an array with one element; its "Plan" object is
    -- the top-level plan node.
    top_rows := plan_doc #>> '{0,Plan,"Plan Rows"}';

    RETURN top_rows;
END;
$func$;

用法测试

postgres=# create table t1234(id int, info text);  
CREATE TABLE  

postgres=# insert into t1234 select generate_series(1,1000000),'test';  
INSERT 0 1000000  

postgres=# analyze t1234;  
ANALYZE  

postgres=# select countit('select * from t1234 where id<1000');  
 countit   
---------  
     954  
(1 row)  

postgres=# select countit('select * from t1234 where id between 1 and 1000 or (id between 100000 and 101000)');  
 countit   
---------  
    1931  
(1 row)

参考
https://github.com/digoal/blog/blob/master/201509/20150919_02.md

待解决疑问：