ทำไมดัชนี GIN บนคอลัมน์ jsonb ทำให้คิวรีของฉันช้าลง และฉันจะทำอย่างไรได้บ้าง

เตรียมข้อมูลการทดสอบ:

-- Test fixture: one table holding a single JSONB document per row.
-- pgcrypto supplies gen_random_uuid(), used below.
CREATE EXTENSION IF NOT EXISTS pgcrypto;
CREATE TABLE docs (
    data jsonb NOT NULL DEFAULT '{}'
);

-- Load 200k documents. Each gets a random uuid as "id", a "type" that is
-- "type1" or "type2" with roughly equal probability, and a unique,
-- increasing "index" taken from the series value.
INSERT INTO docs (data)
SELECT
    json_build_object(
        'id', gen_random_uuid(),
        'type', CASE WHEN random() > 0.5 THEN 'type1' ELSE 'type2' END,
        'index', n
    )::jsonb
FROM generate_series(1, 200000) AS n;

-- Insert one more row with an explicit uuid so it can be queried by id later.
INSERT INTO docs (data)
VALUES (
    json_build_object(
        'id', '30e84646-c5c5-492d-b7f7-c884d77d1e0a',
        'type', 'type1',
        'index', 200001
    )::jsonb
);

คิวรีแรก - กรองตาม data->type และใช้ LIMIT:

-- FAST ~19ms
-- Query 1: containment filter on "type" with LIMIT 25 and no ORDER BY.
-- The recorded plan is a sequential scan that stops as soon as 25 matching
-- rows are found (only 17 rows were rejected by the filter).
-- NOTE(review): the "~19ms" label does not match the 0.098 ms execution time
-- in the plan below; presumably it is client-observed time - confirm.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
LIMIT 25;
/* "Limit  (cost=0.00..697.12 rows=25 width=90) (actual time=0.029..0.070 rows=25 loops=1)"
   "  ->  Seq Scan on docs  (cost=0.00..5577.00 rows=200 width=90) (actual time=0.028..0.061 rows=25 loops=1)"
   "        Filter: (data @> '{"type": "type1"}'::jsonb)"
   "        Rows Removed by Filter: 17"
   "Planning time: 0.069 ms"
   "Execution time: 0.098 ms" 
*/

แบบสอบถามที่สอง - กรองตาม data-> type เรียงลำดับตาม data-> index และ limit

-- SLOW ~250ms
-- Query 2: same filter as before, now ordered by data->'index'.
-- With no index on the sort expression, the recorded plan scans the whole
-- table (100158 rows pass the filter, 99842 are rejected) and then keeps the
-- top 25 with a top-N heapsort. The planner estimated only 200 matching rows.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index' -- added ORDER BY
LIMIT 25;

/* "Limit  (cost=5583.14..5583.21 rows=25 width=90) (actual time=236.750..236.754 rows=25 loops=1)"
   "  ->  Sort  (cost=5583.14..5583.64 rows=200 width=90) (actual time=236.750..236.750 rows=25 loops=1)"
   "        Sort Key: ((data -> 'index'::text))"
   "        Sort Method: top-N heapsort  Memory: 28kB"
   "        ->  Seq Scan on docs  (cost=0.00..5577.50 rows=200 width=90) (actual time=0.020..170.797 rows=100158 loops=1)"
   "              Filter: (data @> '{"type": "type1"}'::jsonb)"
   "              Rows Removed by Filter: 99842"
   "Planning time: 0.075 ms"
   "Execution time: 236.785 ms"
*/

เคียวรีที่สาม - เหมือนกับที่สอง (ก่อนหน้า) แต่มีดัชนี btree ใน data-> index:

-- Expression index on data->'index' (default btree access method); the ORDER BY
-- expression in the query below is written identically so it can use it.
CREATE INDEX docs_data_index_idx ON docs ((data->'index'));

-- FAST ~19ms
-- Query 3: same statement as query 2. The recorded plan walks
-- docs_data_index_idx in order, applies the "type" filter to each row, and
-- stops after 25 matches (17 rows rejected), so no sort is needed.
-- NOTE(review): the "~19ms" label does not match the 0.159 ms execution time
-- in the plan below; presumably it is client-observed time - confirm.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index' -- added BTREE index on this field
LIMIT 25;
/* "Limit  (cost=0.42..2473.98 rows=25 width=90) (actual time=0.040..0.125 rows=25 loops=1)"
   "  ->  Index Scan using docs_data_index_idx on docs  (cost=0.42..19788.92 rows=200 width=90) (actual time=0.038..0.119 rows=25 loops=1)"
   "        Filter: (data @> '{"type": "type1"}'::jsonb)"
   "        Rows Removed by Filter: 17"
   "Planning time: 0.127 ms"
   "Execution time: 0.159 ms"
*/

ข้อความค้นหาที่สี่ - ตอนนี้กรองตาม data-> id และ limit = 1:

-- SLOW ~116ms
-- Query 4: look up a single document by its "id" value with LIMIT 1.
-- No index supports the containment operator yet, so the recorded plan is a
-- sequential scan that rejects 189999 rows before reaching the match.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> ('{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}')::JSONB -- querying by "id" field now
LIMIT 1;
/* "Limit  (cost=0.00..27.89 rows=1 width=90) (actual time=97.990..97.990 rows=1 loops=1)"
   "  ->  Seq Scan on docs  (cost=0.00..5577.00 rows=200 width=90) (actual time=97.989..97.989 rows=1 loops=1)"
   "        Filter: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
   "        Rows Removed by Filter: 189999"
   "Planning time: 0.064 ms"
   "Execution time: 98.012 ms"
*/

คิวรีที่ห้า - เหมือนกับคิวรีที่สี่ แต่มีดัชนี GIN (jsonb_path_ops) บนคอลัมน์ data:

-- GIN index over the whole document using the jsonb_path_ops operator class,
-- so that the @> containment lookups can use an index.
CREATE INDEX docs_data_idx ON docs USING GIN (data jsonb_path_ops);

-- FAST ~17ms
-- Query 5: same lookup by "id" as query 4. The recorded plan now uses a
-- bitmap index scan on docs_data_idx that finds exactly one row and touches a
-- single heap block.
-- NOTE(review): the "~17ms" label does not match the 0.055 ms execution time
-- in the plan below; presumably it is client-observed time - confirm.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::JSONB -- added gin index with jsonb_path_ops
LIMIT 1;
/* "Limit  (cost=17.55..20.71 rows=1 width=90) (actual time=0.027..0.027 rows=1 loops=1)"
   "  ->  Bitmap Heap Scan on docs  (cost=17.55..649.91 rows=200 width=90) (actual time=0.026..0.026 rows=1 loops=1)"
   "        Recheck Cond: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
   "        Heap Blocks: exact=1"
   "        ->  Bitmap Index Scan on docs_data_idx  (cost=0.00..17.50 rows=200 width=0) (actual time=0.016..0.016 rows=1 loops=1)"
   "              Index Cond: (data @> '{"id": "30e84646-c5c5-492d-b7f7-c884d77d1e0a"}'::jsonb)"
   "Planning time: 0.095 ms"
   "Execution time: 0.055 ms"
*/

คิวรีที่หก (และสุดท้าย) - เหมือนกับคิวรีที่สาม (กรองตาม data->type, เรียงลำดับตาม data->index และใช้ LIMIT):

-- SLOW AGAIN! ~224ms
-- Query 6: filter on "type", ORDER BY data->'index', LIMIT 25, run after the
-- GIN index docs_data_idx exists. The recorded plan builds a bitmap from
-- docs_data_idx (estimated 200 rows, actually 100158), fetches 3077 heap
-- blocks, and then sorts with a top-N heapsort instead of reading rows in
-- index order.
EXPLAIN ANALYZE
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index'
LIMIT 25;
/* "Limit  (cost=656.06..656.12 rows=25 width=90) (actual time=215.927..215.932 rows=25 loops=1)"
   "  ->  Sort  (cost=656.06..656.56 rows=200 width=90) (actual time=215.925..215.925 rows=25 loops=1)"
   "        Sort Key: ((data -> 'index'::text))"
   "        Sort Method: top-N heapsort  Memory: 28kB"
   "        ->  Bitmap Heap Scan on docs  (cost=17.55..650.41 rows=200 width=90) (actual time=33.134..152.618 rows=100158 loops=1)"
   "              Recheck Cond: (data @> '{"type": "type1"}'::jsonb)"
   "              Heap Blocks: exact=3077"
   "              ->  Bitmap Index Scan on docs_data_idx  (cost=0.00..17.50 rows=200 width=0) (actual time=32.468..32.468 rows=100158 loops=1)"
   "                    Index Cond: (data @> '{"type": "type1"}'::jsonb)"
   "Planning time: 0.157 ms"
   "Execution time: 215.992 ms"
*/

ดังนั้นดูเหมือนว่าคิวรีที่หก (ซึ่งเหมือนกับคิวรีที่สาม) จะช้าลงมากเมื่อมีดัชนี GIN บนคอลัมน์ data อาจเป็นเพราะฟิลด์ data->type มีค่าที่แตกต่างกันไม่มาก (มีเพียง "type1" หรือ "type2") ใช่หรือไม่? ฉันจะทำอะไรได้บ้าง? ฉันต้องใช้ดัชนี GIN สำหรับคิวรีอื่นที่ได้ประโยชน์จากมัน ...

postgresql postgresql-9.4

— user606521
แหล่งที่มา

ดูเหมือนว่าคุณได้พบกับปัญหาที่คอลัมน์ jsonb ใช้อัตราสถิติคงที่ 1% ตามที่รายงานไว้ในหัวข้อ "การแก้ปัญหาเรื่องการขาดสถิติของ jsonb" เมื่อดูที่แผนคิวรีของคุณ ความแตกต่างระหว่างค่าประมาณกับการประมวลผลจริงนั้นมีมาก ค่าประมาณบอกว่าน่าจะมี 200 แถว แต่ผลจริงคืนมา 100,158 แถว ซึ่งทำให้ตัววางแผน (planner) เลือกกลยุทธ์บางอย่างมากกว่ากลยุทธ์อื่น

เนื่องจากตัวเลือกในเคียวรีที่หกดูเหมือนว่าจะนิยมการสแกนดัชนีบิตแมปมากกว่าการสแกนดัชนีคุณสามารถเขยิบนักวางแผนพร้อมกับSET enable_bitmapscan=offลองและนำมันกลับไปใช้พฤติกรรมที่คุณมีในตัวอย่างที่สามของคุณ

นี่คือวิธีการทำงานสำหรับฉัน:

postgres@[local]:5432:postgres:=# EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index'
LIMIT 25;
                                                                QUERY PLAN                                                                 
-------------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=656.06..656.12 rows=25 width=90) (actual time=117.338..117.343 rows=25 loops=1)
   Buffers: shared hit=3096
   ->  Sort  (cost=656.06..656.56 rows=200 width=90) (actual time=117.336..117.338 rows=25 loops=1)
         Sort Key: ((data -> 'index'::text))
         Sort Method: top-N heapsort  Memory: 28kB
         Buffers: shared hit=3096
         ->  Bitmap Heap Scan on docs  (cost=17.55..650.41 rows=200 width=90) (actual time=12.838..80.584 rows=99973 loops=1)
               Recheck Cond: (data @> '{"type": "type1"}'::jsonb)
               Heap Blocks: exact=3077
               Buffers: shared hit=3096
               ->  Bitmap Index Scan on docs_data_idx  (cost=0.00..17.50 rows=200 width=0) (actual time=12.469..12.469 rows=99973 loops=1)
                     Index Cond: (data @> '{"type": "type1"}'::jsonb)
                     Buffers: shared hit=19
 Planning time: 0.088 ms
 Execution time: 117.405 ms
(15 rows)

Time: 117.813 ms
postgres@[local]:5432:postgres:=# SET enable_bitmapscan = off;
SET
Time: 0.130 ms
postgres@[local]:5432:postgres:=# EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index'
LIMIT 25;
                                                               QUERY PLAN                                                               
----------------------------------------------------------------------------------------------------------------------------------------
 Limit  (cost=0.42..1320.48 rows=25 width=90) (actual time=0.017..0.050 rows=25 loops=1)
   Buffers: shared hit=4
   ->  Index Scan using docs_data_index_idx on docs  (cost=0.42..10560.94 rows=200 width=90) (actual time=0.015..0.045 rows=25 loops=1)
         Filter: (data @> '{"type": "type1"}'::jsonb)
         Rows Removed by Filter: 27
         Buffers: shared hit=4
 Planning time: 0.083 ms
 Execution time: 0.071 ms
(8 rows)

Time: 0.402 ms
postgres@[local]:5432:postgres:=#

หากคุณกำลังมองหาเส้นทางนี้ให้แน่ใจว่าได้ปิดการใช้งานการสแกนเฉพาะสำหรับการค้นหาที่แสดงพฤติกรรมเช่นนี้มิฉะนั้นคุณจะได้รับพฤติกรรมที่ไม่ดีกับแผนการสืบค้นอื่น ๆ ด้วย การทำอะไรแบบนี้น่าจะใช้ได้ดี:

-- Run this one query with bitmap scans disabled, without affecting any other
-- statement in the session. SET LOCAL lasts only until the end of the current
-- transaction, so the session's previous enable_bitmapscan value comes back
-- automatically on COMMIT or ROLLBACK. A plain SET followed by
-- "SET enable_bitmapscan=on" would instead force the setting to on, even if
-- the session had deliberately turned it off beforehand.
BEGIN;
SET LOCAL enable_bitmapscan = off;
SELECT * FROM docs
WHERE data @> '{"type": "type1"}'::JSONB
ORDER BY data->'index'
LIMIT 25;
COMMIT;

หวังว่าจะช่วย =)

— Kassandry
แหล่งที่มา

ฉันไม่แน่ใจว่าฉันเข้าใจคุณถูกต้องหรือไม่ (ฉันไม่คุ้นเคยกับการทำงานภายในของ PG) - พฤติกรรมนี้เกิดจาก cardinality ที่ต่ำของฟิลด์ "type" ในคอลัมน์ jsonb (และเกิดจากอัตราสถิติคงที่) ใช่ไหม? และนั่นหมายความว่าถ้าฉันต้องการเพิ่มประสิทธิภาพคิวรีของฉัน ฉันต้องทราบ cardinality โดยประมาณของฟิลด์ jsonb ที่ฉันกำลังคิวรี เพื่อตัดสินใจว่าควรเปิดใช้งาน enable_bitmapscan หรือไม่ ใช่ไหม?

— user606521

ใช่ ดูเหมือนคุณจะเข้าใจถูกต้องทั้งสองประเด็น ค่า selectivity พื้นฐาน 1% ทำให้ตัววางแผนเลือกค้นหาฟิลด์ในส่วนคำสั่ง WHERE ผ่านดัชนี GIN เพราะเชื่อว่าจะได้แถวกลับมาน้อยกว่า ซึ่งไม่เป็นความจริง เนื่องจากคุณประมาณจำนวนแถวได้ดีกว่า คุณจึงเห็นได้ว่าเมื่อใช้ ORDER BY data->'index' LIMIT 25 การสแกนรายการแรก ๆ ของดัชนีอีกตัว (ราว 50 รายการ รวมแถวที่ถูกคัดทิ้ง) จะต้องอ่านแถวน้อยกว่ามาก ดังนั้นการบอกตัววางแผนว่าไม่ควรใช้ bitmap scan จึงทำให้ได้แผนคิวรีที่เร็วกว่า หวังว่าจะช่วยให้ชัดเจนขึ้น =)

— Kassandry

มีข้อมูลที่ชัดเจนเพิ่มเติมที่นี่: databasesoup.com/2015/01/tag-all-things-part-3.html และในการนำเสนอนี้ thebuild.com/presentations/json2015-pgconfus.pdf ซึ่งน่าจะช่วยได้เช่นกัน

— Kassandry

งานเดียวที่ฉันรู้คือมาจาก Oleg Bartunov, Teodor Sigaev และ Alexander Korotkov ในส่วนขยาย JsQuery และการปรับปรุง selectivity ของมัน หากโชคดี งานนี้จะถูกรวมเข้าสู่แกนหลักของ PostgreSQL ในรุ่น 9.6 หรือใหม่กว่า

— Kassandry

ฉันเพียงอ้างอิงตัวเลข 1% ในคำตอบของฉันมาจากอีเมลของ Josh Berkus สมาชิกทีม PostgreSQL Core ส่วนตัวเลขนี้มาจากไหนนั้นต้องอาศัยความเข้าใจการทำงานภายในลึกกว่าที่ฉันมีอยู่ในปัจจุบัน ขอโทษด้วย =( คุณสามารถลองตอบกลับไปที่ pgsql-performance@postgresql.org หรือสอบถามใน freenode IRC #postgresql ว่าตัวเลขนั้นมาจากไหนกันแน่

— Kassandry