255 lines
14 KiB
SQL
255 lines
14 KiB
SQL
CREATE DATABASE db_gbk TEMPLATE template0 encoding 'GBK' lc_ctype 'zh_CN.GBK' lc_collate 'zh_CN.GBK';
|
|
\c db_gbk
|
|
set client_encoding='UTF8';
|
|
show client_encoding;
|
|
show server_encoding;
|
|
------------------------------ basic synax for text search configuration -------------------------------
|
|
--
|
|
--- Pound
|
|
--
|
|
-- CREATE TEXT SEARCH CONFIGURATION
|
|
create text search configuration pound_gbk(parser=pound) with (split_flag = '#');
|
|
ALTER TEXT SEARCH CONFIGURATION pound_gbk ADD MAPPING FOR zh_words, en_word, numeric, alnum, grapsymbol, multisymbol WITH simple;
|
|
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
|
|
SELECT to_tsvector('pound_gbk', '京东#淘宝#滴滴#爱奇艺#芒果TV');
|
|
|
|
-- SET THE SPLIT FLAG to '@'
|
|
alter text search configuration pound_gbk set (split_flag = '@');
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
SELECT to_tsvector('pound_gbk', '京东@淘宝@滴滴@爱奇艺@芒果TV');
|
|
|
|
-- SET THE SPLIT FLAG to '$'
|
|
alter text search configuration pound_gbk set (split_flag = '$');
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
SELECT to_tsvector('pound_gbk', '京东$淘宝$滴滴$爱奇艺$芒果TV');
|
|
|
|
-- SET THE SPLIT FLAG to '%'
|
|
alter text search configuration pound_gbk set (split_flag = '%');
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
SELECT to_tsvector('pound_gbk', '京东%淘宝%滴滴%爱奇艺%芒果TV');
|
|
|
|
-- SET THE SPLIT FLAG to '/'
|
|
alter text search configuration pound_gbk set (split_flag = '/');
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
SELECT to_tsvector('pound_gbk', '京东/淘宝/滴滴/爱奇艺/芒果TV');
|
|
|
|
-- FOR UNSUPPORTED SPLPT_FLAG, REPORT ERROR
|
|
alter text search configuration pound_gbk set (split_flag = ',');
|
|
|
|
-- WHEN SPLIT FLAG IS NOT SINGLE CHARACTER, REPORT ERROR
|
|
alter text search configuration pound_gbk set (split_flag = '#@');
|
|
|
|
-- WHEN SPLIT FLAG IS NULL CHARACTER, REPORT ERROR
|
|
alter text search configuration pound_gbk set (split_flag = '');
|
|
|
|
-- WHEN SINGLE TOKEN EXCEED 256 CHARACTERS, SPLIT DIRECTLY EVEN NO SPLIT FLAG IS FOUND
|
|
select to_tsvector('pound_gbk','1981年8月26日,美国宇宙飞船“旅行者2号”飞过土星,取得了一系列探测成果,其中包括发现了土星的第17颗卫星——土卫17。所以,今天我们就来说一说“旅行者2号”上的“地球之音”唱片。1977年8月和9月,人类成功发射了“旅行者1号”和“旅行者2号”探测器,再次向外星人作了更详细的自我介绍。这次,它们各自携带了一张称为“地球之音”的唱片,上面录制了丰富的地球信息。这两张唱片都是镀金铜质的,直径为30.5厘米。唱片上录有115幅照片和图表,35种各类声音,近60种语言的问候语和27首世界著名乐曲等。115幅照片中包括我国八达岭长城');
|
|
select to_tsvector('pound_gbk','1981年8月26日,美国宇宙飞船“旅行者2号”飞过土星,取得了一系列探测成果,其中包括发现了土星的第17颗卫星——土卫17。所以,今天我们就来说一说“旅行者2号”上的“地球之音”唱片。1977年8月和9月,人类成功发射了“旅行者1号”和“旅行者2号”探测器,再次向外星人作了更详细的自我介绍。这次,它们各自携带了一张称为“地球之音”的唱片,上面录制了丰富的地球信息。这两张唱片都是镀金铜质的,直径为30.5厘米。唱片上录有115幅照片和图表,35种各类声音,近60种语言的问候语和27首世界著名乐曲等。115幅///照片中包括我国八达岭长城');
|
|
|
|
drop text search configuration pound_gbk;
|
|
create text search configuration pound_gbk(parser=pound) with (split_flag = '#');
|
|
ALTER TEXT SEARCH CONFIGURATION pound_gbk ADD MAPPING FOR zh_words, en_word, numeric, alnum, grapsymbol, multisymbol WITH simple;
|
|
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'pound_gbk';
|
|
|
|
SELECT to_tsvector('pound_gbk', '京东#淘宝#滴滴#爱奇艺#芒果TV');
|
|
|
|
--
|
|
--- N-gram
|
|
--
|
|
-- CREATE TEXT SEARCH CONFIGURATION
|
|
create text search configuration ngram1(parser=ngram) with (punctuation_ignore = on, gram_size = 2, grapsymbol_ignore = on);
|
|
alter text search configuration ngram1 ADD MAPPING FOR zh_words, en_word, numeric, alnum, grapsymbol, multisymbol with simple;
|
|
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'ngram1';
|
|
|
|
SELECT to_tsvector('ngram1', '画,龙&点睛');
|
|
|
|
--punctuation_ignore
|
|
alter text search configuration ngram1 set (punctuation_ignore = off);
|
|
SELECT to_tsvector('ngram1', '画,龙&点睛');
|
|
|
|
--punctuation_ignore
|
|
alter text search configuration ngram1 set (grapsymbol_ignore = off);
|
|
SELECT to_tsvector('ngram1', '画,龙&点睛');
|
|
|
|
--punctuation_ignore
|
|
alter text search configuration ngram1 set (gram_size = 3);
|
|
SELECT to_tsvector('ngram1', '画,龙&点睛');
|
|
|
|
drop text search configuration ngram1;
|
|
create text search configuration ngram1(parser=ngram);
|
|
alter text search configuration ngram1 ADD MAPPING FOR zh_words, en_word, numeric, alnum, grapsymbol, multisymbol with simple;
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'ngram1';
|
|
SELECT to_tsvector('ngram1', '画,龙&点睛');
|
|
|
|
|
|
--------------------------- basic operation for text search parser ----------------------------------
|
|
--
|
|
---- N-gram parser
|
|
--
|
|
set ngram_gram_size = 2;
|
|
set ngram_grapsymbol_ignore = off;
|
|
set ngram_punctuation_ignore = off;
|
|
|
|
select ts_parse('ngram', 'Uber提供了电脑版、手机版&平板电脑等版本');
|
|
|
|
-- configuration parameter: ngram.punctuation_ignore
|
|
set ngram_punctuation_ignore = on;
|
|
select ts_parse('ngram', 'Uber提供了电脑版、手机版&平板电脑等版本');
|
|
|
|
-- configuration parameter: ngram.grapsymbol_ignore
|
|
set ngram_grapsymbol_ignore = on;
|
|
select ts_parse('ngram', 'Uber提供了电脑版、手机版&平板电脑等版本');
|
|
|
|
-- configuration parameter: ngram.grapsymbol_ignore
|
|
set ngram_gram_size = 4;
|
|
select ts_parse('ngram', 'Uber提供了电脑版、手机版&平板电脑等版本');
|
|
|
|
select ts_token_type('ngram');
|
|
|
|
------------------------------- basic operation for text search datatype -----------------------
|
|
--
|
|
--- pound
|
|
--
|
|
select to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV');
|
|
select to_tsquery('pound_gbk','芒果TV');
|
|
|
|
--
|
|
---- N-gram
|
|
--
|
|
select to_tsvector('ngram1','辽GQQ360');
|
|
select to_tsquery('ngram1','360');
|
|
|
|
------------------------------- basic operation for text search operator -----------------------
|
|
--
|
|
--- pound
|
|
--
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ to_tsquery('pound_gbk','芒果TV'); -- true
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@@ to_tsquery('pound_gbk','芒果TV'); -- true
|
|
|
|
select to_tsvector('pound_gbk','芒果TV') @@ to_tsquery('pound_gbk','QQ'); -- false
|
|
select (to_tsvector('pound_gbk','滴滴') || to_tsvector('pound_gbk','爱奇艺')) @@ to_tsquery('pound_gbk','滴滴'); -- true
|
|
|
|
select to_tsquery('pound_gbk','淘宝') && to_tsquery('pound_gbk','爱奇艺');
|
|
select to_tsquery('pound_gbk','淘宝') || to_tsquery('pound_gbk','爱奇艺');
|
|
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ (to_tsquery('pound_gbk','淘宝')); -- false
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ (to_tsquery('pound_gbk','爱奇艺')); -- true
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ (to_tsquery('pound_gbk','淘宝') || to_tsquery('pound_gbk','爱奇艺')); -- true
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ (to_tsquery('pound_gbk','淘宝') && to_tsquery('pound_gbk','爱奇艺')); --false
|
|
|
|
select !!to_tsquery('pound_gbk','爱奇艺');
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ !!to_tsquery('pound_gbk','爱奇艺'); -- false
|
|
select to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @@ !!to_tsquery('pound_gbk','淘宝'); --true
|
|
|
|
select to_tsquery('pound_gbk','爱奇艺') @> to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'); -- false
|
|
select to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') @> to_tsquery('pound_gbk','爱奇艺'); -- true
|
|
|
|
select to_tsquery('pound_gbk','爱奇艺') <@ to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'); -- true
|
|
select to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV') <@ to_tsquery('pound_gbk','爱奇艺'); -- false
|
|
|
|
--
|
|
---- N-gram
|
|
--
|
|
select to_tsvector('ngram1','辽GQQ360') @@ to_tsquery('ngram1','360'); -- true
|
|
select to_tsvector('ngram1','辽GQQ360') @@@ to_tsquery('ngram1','360'); -- true
|
|
|
|
select to_tsvector('ngram1','360') @@ to_tsquery('ngram1','QQ'); -- false
|
|
select (to_tsvector('ngram1','辽GQQ') || to_tsvector('ngram1','360')) @@ to_tsquery('ngram1','QQ'); -- true
|
|
|
|
select to_tsquery('ngram1','480') && to_tsquery('ngram1','辽GQQ');
|
|
select to_tsquery('ngram1','480') || to_tsquery('ngram1','辽GQQ');
|
|
|
|
select to_tsvector('ngram1','辽GQQ360') @@ (to_tsquery('ngram1','480')); -- false
|
|
select to_tsvector('ngram1','辽GQQ360') @@ (to_tsquery('ngram1','辽GQQ')); -- true
|
|
select to_tsvector('ngram1','辽GQQ360') @@ (to_tsquery('ngram1','480') || to_tsquery('ngram1','辽GQQ')); -- true
|
|
select to_tsvector('ngram1','辽GQQ360') @@ (to_tsquery('ngram1','480') && to_tsquery('ngram1','辽GQQ')); -- false
|
|
|
|
select !!to_tsquery('ngram1','360') ;
|
|
select to_tsvector('ngram1','辽GQQ360') @@ !!to_tsquery('ngram1','360'); -- false
|
|
select to_tsvector('ngram1','辽GQQ360') @@ !!to_tsquery('ngram1','480'); -- true
|
|
|
|
select to_tsquery('ngram1','360') @> to_tsquery('ngram1','辽GQQ360'); -- false
|
|
select to_tsquery('ngram1','辽GQQ360') @> to_tsquery('ngram1','360'); -- true
|
|
|
|
select to_tsquery('ngram1','360') <@ to_tsquery('ngram1','辽GQQ360'); -- true
|
|
select to_tsquery('ngram1','辽GQQ360') <@ to_tsquery('ngram1','360'); -- false
|
|
|
|
------------------------------- basic operation for text search function -----------------------
|
|
-- pound
|
|
select get_current_ts_config(); -- english
|
|
select length(to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV')); -- 4
|
|
select length(to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV#QQ')); -- 5
|
|
|
|
select numnode(to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'));
|
|
select numnode(to_tsquery('pound_gbk','芒果TV') || to_tsquery('pound_gbk','滴滴'));
|
|
|
|
-- select plainto_tsquery('pound_gbk', 'Uber##滴滴# 爱奇艺##芒果TV');
|
|
-- select plainto_tsquery('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV');
|
|
|
|
select querytree(to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'));
|
|
select querytree(to_tsquery('pound_gbk','滴滴') && to_tsquery('pound_gbk','爱奇艺'));
|
|
select querytree(to_tsquery('pound_gbk','滴滴') || to_tsquery('pound_gbk','爱奇艺'));
|
|
|
|
select strip(to_tsvector('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'));
|
|
select ts_headline('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV', to_tsquery('pound_gbk','Uber'));
|
|
|
|
select ts_rank(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'));
|
|
select ts_rank(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','Uber'));
|
|
select ts_rank(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','滴滴'));
|
|
|
|
select ts_rank_cd(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'));
|
|
select ts_rank_cd(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','Uber'));
|
|
select ts_rank_cd(to_tsvector('pound_gbk', 'Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','滴滴'));
|
|
|
|
select ts_rewrite(to_tsquery('pound_gbk','Uber#滴滴#爱奇艺#芒果TV'), to_tsquery('pound_gbk','Uber'), to_tsquery('pound_gbk','爱奇艺'));
|
|
|
|
--N-gram
|
|
select get_current_ts_config(); -- english
|
|
select length(to_tsvector('ngram1','辽GQQ360')); -- 6
|
|
select length(to_tsvector('ngram1','辽GQQ888')); -- 5
|
|
|
|
select numnode(to_tsquery('ngram1','辽GQQ360')); -- 11
|
|
select numnode(to_tsquery('ngram1','480') || to_tsquery('ngram1','辽GQQ')); -- 9
|
|
|
|
alter text search configuration ngram1 set (grapsymbol_ignore = off);
|
|
alter text search configuration ngram1 set (punctuation_ignore = on);
|
|
alter text search configuration ngram1 set (gram_size = 2);
|
|
|
|
select cfgname, cfoptions from pg_ts_config where cfgname = 'ngram1';
|
|
|
|
select plainto_tsquery('ngram1', '车,牌,号');
|
|
select plainto_tsquery('ngram1', '车,牌,号');
|
|
|
|
select querytree(to_tsquery('ngram1','中国人民银行'));
|
|
select querytree(to_tsquery('ngram1','中国人') && to_tsquery('ngram1','民银行'));
|
|
select querytree(to_tsquery('ngram1','中国人') || to_tsquery('ngram1','民银行'));
|
|
|
|
select strip(to_tsvector('ngram1','中国人民银行'));
|
|
select ts_headline('ngram1', '中国人民银行', to_tsquery('ngram1','中国人'));
|
|
|
|
select ts_rank(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中国人民银行'));
|
|
select ts_rank(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中国人'));
|
|
select ts_rank(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中人'));
|
|
|
|
select ts_rank_cd(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中国人民银行'));
|
|
select ts_rank_cd(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中国人'));
|
|
select ts_rank_cd(to_tsvector('ngram1', '中国人民银行'), to_tsquery('ngram1','中人'));
|
|
|
|
select ts_rewrite(to_tsquery('ngram1','中国人民银行'), to_tsquery('ngram1','银行'), to_tsquery('ngram1','政府'));
|
|
|
|
-------------------------------- add test cases for deprecated zhparser -------------------------
|
|
select to_tsvector('zhparser', '城乡建设');
|
|
select ts_parse('zhparser', '版权所有 CopyRight 2006-2015 酷狗音乐');
|
|
select to_tsquery('zhparser','中国人民银行');
|
|
select * from ts_token_type('zhparser');
|
|
|
|
|
|
drop text search configuration ngram1;
|
|
drop text search configuration pound_gbk;
|
|
\c postgres
|
|
DROP DATABASE db_gbk;
|