2726 lines
		
	
	
		
			129 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			2726 lines
		
	
	
		
			129 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html><head>
 | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
| <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | |
| <link href="sqlite.css" rel="stylesheet">
 | |
| <title>SQLite FTS5 Extension</title>
 | |
| <!-- path= -->
 | |
| </head>
 | |
| <body>
 | |
| <div class=nosearch>
 | |
| <a href="index.html">
 | |
| <img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | |
| </a>
 | |
| <div><!-- IE hack to prevent disappearing logo --></div>
 | |
| <div class="tagline desktoponly">
 | |
| Small. Fast. Reliable.<br>Choose any three.
 | |
| </div>
 | |
| <div class="menu mainmenu">
 | |
| <ul>
 | |
| <li><a href="index.html">Home</a>
 | |
| <li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | |
| <li class='wideonly'><a href='about.html'>About</a>
 | |
| <li class='desktoponly'><a href="docs.html">Documentation</a>
 | |
| <li class='desktoponly'><a href="download.html">Download</a>
 | |
| <li class='wideonly'><a href='copyright.html'>License</a>
 | |
| <li class='desktoponly'><a href="support.html">Support</a>
 | |
| <li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | |
| <li class='search' id='search_menubutton'>
 | |
| <a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="menu submenu" id="submenu">
 | |
| <ul>
 | |
| <li><a href='about.html'>About</a>
 | |
| <li><a href='docs.html'>Documentation</a>
 | |
| <li><a href='download.html'>Download</a>
 | |
| <li><a href='support.html'>Support</a>
 | |
| <li><a href='prosupport.html'>Purchase</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="searchmenu" id="searchmenu">
 | |
| <form method="GET" action="search">
 | |
| <select name="s" id="searchtype">
 | |
| <option value="d">Search Documentation</option>
 | |
| <option value="c">Search Changelog</option>
 | |
| </select>
 | |
| <input type="text" name="q" id="searchbox" value="">
 | |
| <input type="submit" value="Go">
 | |
| </form>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_div(nm) {
 | |
| var w = document.getElementById(nm);
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| }else{
 | |
| w.style.display = "block";
 | |
| }
 | |
| }
 | |
| function toggle_search() {
 | |
| var w = document.getElementById("searchmenu");
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| } else {
 | |
| w.style.display = "block";
 | |
| setTimeout(function(){
 | |
| document.getElementById("searchbox").focus()
 | |
| }, 30);
 | |
| }
 | |
| }
 | |
| function div_off(nm){document.getElementById(nm).style.display="none";}
 | |
| window.onbeforeunload = function(e){div_off("submenu");}
 | |
| /* Disable the Search feature if we are not operating from CGI, since */
 | |
| /* Search is accomplished using CGI and will not work without it. */
 | |
| if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
 | |
| document.getElementById("search_menubutton").style.display = "none";
 | |
| }
 | |
| /* Used by the Hide/Show button beside syntax diagrams, to toggle the */
 | |
| function hideorshow(btn,obj){
 | |
| var x = document.getElementById(obj);
 | |
| var b = document.getElementById(btn);
 | |
| if( x.style.display!='none' ){
 | |
| x.style.display = 'none';
 | |
| b.innerHTML='show';
 | |
| }else{
 | |
| x.style.display = '';
 | |
| b.innerHTML='hide';
 | |
| }
 | |
| return false;
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| <div class=fancy>
 | |
| <div class=nosearch>
 | |
| <div class="fancy_title">
 | |
| SQLite FTS5 Extension
 | |
| </div>
 | |
| <div class="fancy_toc">
 | |
| <a onclick="toggle_toc()">
 | |
| <span class="fancy_toc_mark" id="toc_mk">►</span>
 | |
| Table Of Contents
 | |
| </a>
 | |
| <div id="toc_sub"><div class="fancy-toc1"><a href="#overview_of_fts5">1. Overview of FTS5</a></div>
 | |
| <div class="fancy-toc1"><a href="#compiling_and_using_fts5">2. Compiling and Using FTS5</a></div>
 | |
| <div class="fancy-toc2"><a href="#building_fts5_as_part_of_sqlite">2.1. Building FTS5 as part of SQLite</a></div>
 | |
| <div class="fancy-toc2"><a href="#building_a_loadable_extension">2.2. Building a Loadable Extension</a></div>
 | |
| <div class="fancy-toc1"><a href="#full_text_query_syntax">3. Full-text Query Syntax</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_strings">3.1. FTS5 Strings</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_phrases">3.2. FTS5 Phrases</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_prefix_queries">3.3. FTS5 Prefix Queries</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_initial_token_queries">3.4. FTS5 Initial Token Queries</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_near_queries">3.5. FTS5 NEAR Queries</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_column_filters">3.6. FTS5 Column Filters</a></div>
 | |
| <div class="fancy-toc2"><a href="#fts5_boolean_operators">3.7. FTS5 Boolean Operators</a></div>
 | |
| <div class="fancy-toc1"><a href="#fts5_table_creation_and_initialization">4. FTS5 Table Creation and Initialization</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_unindexed_column_option">4.1. The UNINDEXED column option</a></div>
 | |
| <div class="fancy-toc2"><a href="#prefix_indexes">4.2. Prefix Indexes</a></div>
 | |
| <div class="fancy-toc2"><a href="#tokenizers">4.3. Tokenizers</a></div>
 | |
| <div class="fancy-toc3"><a href="#unicode61_tokenizer">4.3.1. Unicode61 Tokenizer</a></div>
 | |
| <div class="fancy-toc3"><a href="#ascii_tokenizer">4.3.2. Ascii Tokenizer</a></div>
 | |
| <div class="fancy-toc3"><a href="#porter_tokenizer">4.3.3. Porter Tokenizer</a></div>
 | |
| <div class="fancy-toc2"><a href="#external_content_and_contentless_tables">4.4. External Content and Contentless Tables</a></div>
 | |
| <div class="fancy-toc3"><a href="#contentless_tables">4.4.1. Contentless Tables</a></div>
 | |
| <div class="fancy-toc3"><a href="#external_content_tables">4.4.2. External Content Tables</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_columnsize_option">4.5. The Columnsize Option</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_detail_option">4.6. The Detail Option</a></div>
 | |
| <div class="fancy-toc1"><a href="#_auxiliary_functions_">5.  Auxiliary Functions </a></div>
 | |
| <div class="fancy-toc2"><a href="#built_in_auxiliary_functions">5.1. Built-in Auxiliary Functions</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_bm25_function">5.1.1. The bm25() function</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_highlight_function">5.1.2. The highlight() function</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_snippet_function">5.1.3. The snippet() function</a></div>
 | |
| <div class="fancy-toc2"><a href="#sorting_by_auxiliary_function_results">5.2. Sorting by Auxiliary Function Results</a></div>
 | |
| <div class="fancy-toc1"><a href="#special_insert_commands">6. Special INSERT Commands</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_automerge_configuration_option">6.1. The 'automerge' Configuration Option</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_crisismerge_configuration_option">6.2. The 'crisismerge' Configuration Option</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_delete_command">6.3. The 'delete' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_delete_all_command">6.4. The 'delete-all' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_integrity_check_command">6.5. The 'integrity-check' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_merge_command">6.6. The 'merge' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_optimize_command">6.7. The 'optimize' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_pgsz_configuration_option">6.8. The 'pgsz' Configuration Option</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_rank_configuration_option">6.9. The 'rank' Configuration Option</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_rebuild_command">6.10. The 'rebuild' Command</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_usermerge_configuration_option">6.11. The 'usermerge' Configuration Option</a></div>
 | |
| <div class="fancy-toc1"><a href="#extending_fts5">7. Extending FTS5</a></div>
 | |
| <div class="fancy-toc2"><a href="#custom_tokenizers">7.1. Custom Tokenizers</a></div>
 | |
| <div class="fancy-toc3"><a href="#synonym_support">7.1.1. Synonym Support</a></div>
 | |
| <div class="fancy-toc2"><a href="#custom_auxiliary_functions">7.2. Custom Auxiliary Functions</a></div>
 | |
| <div class="fancy-toc3"><a href="#_custom_auxiliary_functions_api_reference_">7.2.1.
 | |
| Custom Auxiliary Functions API Reference
 | |
| </a></div>
 | |
| <div class="fancy-toc1"><a href="#the_fts5vocab_virtual_table_module">8. The fts5vocab Virtual Table Module</a></div>
 | |
| <div class="fancy-toc1"><a href="#appendix_a">
 | |
| Appendix A: Comparison with FTS3/4
 | |
| </a></div>
 | |
| <div class="fancy-toc2"><a href="#_application_porting_guide_"> Application Porting Guide </a></div>
 | |
| <div class="fancy-toc3"><a href="#_changes_to_create_virtual_table_statements_"> Changes to CREATE VIRTUAL TABLE statements </a></div>
 | |
| <div class="fancy-toc3"><a href="#_changes_to_select_statements_"> Changes to SELECT statements </a></div>
 | |
| <div class="fancy-toc3"><a href="#_auxiliary_function_changes_"> Auxiliary Function Changes </a></div>
 | |
| <div class="fancy-toc3"><a href="#_other_issues"> Other Issues</a></div>
 | |
| <div class="fancy-toc2"><a href="#_summary_of_technical_differences_">
 | |
| Summary of Technical Differences
 | |
| </a></div>
 | |
| <div class="fancy-toc1"><a href="#appendix_b">
 | |
| Appendix B: Shadow tables created by FTS5
 | |
| </a></div>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_toc(){
 | |
| var sub = document.getElementById("toc_sub")
 | |
| var mk = document.getElementById("toc_mk")
 | |
| if( sub.style.display!="block" ){
 | |
| sub.style.display = "block";
 | |
| mk.innerHTML = "▼";
 | |
| } else {
 | |
| sub.style.display = "none";
 | |
| mk.innerHTML = "►";
 | |
| }
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| <h1 id="overview_of_fts5"><span>1. </span>Overview of FTS5</h1>
 | |
| 
 | |
| <p>FTS5 is an SQLite <a href="c3ref/module.html">virtual table module</a> that provides 
 | |
| <a href="http://en.wikipedia.org/wiki/Full_text_search">full-text search</a>
 | |
| functionality to database applications. In their most elementary form, 
 | |
| full-text search engines allow the user to efficiently search a large 
 | |
| collection of documents for the subset that contain one or more instances of a
 | |
| search term. The search functionality provided to world wide web users by
 | |
| <a href="www.google.com">Google</a> is, among other things, a full-text search
 | |
| engine, as it allows users to search for all documents on the web that contain,
 | |
| for example, the term "fts5".
 | |
| 
 | |
| </p><p>To use FTS5, the user creates an FTS5 virtual table with one or more
 | |
| columns. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>CREATE VIRTUAL TABLE email USING fts5(sender, title, body);
 | |
| </pre></div>
 | |
| 
 | |
| <p>It is an error to add types, constraints or <a href="lang_createtable.html#primkeyconst">PRIMARY KEY</a> declarations to 
 | |
| a CREATE VIRTUAL TABLE statement used to create an FTS5 table. Once created,
 | |
| an FTS5 table may be populated using <a href="lang_insert.html">INSERT</a>, <a href="lang_update.html">UPDATE</a> or <a href="lang_delete.html">DELETE</a> statements
 | |
| like any other table. Like any other table with no PRIMARY KEY declaration, an
 | |
| FTS5 table has an implicit INTEGER PRIMARY KEY field named rowid. 
 | |
| 
 | |
| </p><p>Not shown in the example above is that there are also 
 | |
| <a href="fts5.html#fts5_table_creation_and_initialization">various options</a> that may be provided to FTS5 as
 | |
| part of the CREATE VIRTUAL TABLE statement to configure various aspects of the
 | |
| new table. These may be used to modify the way in which the FTS5 table extracts
 | |
| terms from documents and queries, to create extra indexes on disk to speed up
 | |
| prefix queries, or to create an FTS5 table that acts as an index on content
 | |
| stored elsewhere.
 | |
| 
 | |
| </p><p>Once populated, there are three ways to execute a full-text query against
 | |
| the contents of an FTS5 table:
 | |
| 
 | |
| </p><ul><li> Using a MATCH operator in the WHERE clause of a SELECT statement, or
 | |
|     </li><li> Using an equals ("=") operator in the WHERE clause of a SELECT statement, or
 | |
|     </li><li> using the <a href="vtab.html#tabfunc2">table-valued function</a> syntax.
 | |
| </li></ul>
 | |
| 
 | |
| <p>If using the MATCH or = operators, the expression to the left of the MATCH
 | |
|    operator is usually the name of the FTS5 table (the exception is when 
 | |
|    <a href="fts5.html#fts5_column_filters">specifying a column-filter</a>). The expression on the right
 | |
|    must be a text value specifying the term to search for. For the table-valued
 | |
|    function syntax, the term to search for is specified as the first table argument.
 | |
|    For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Query for all rows that contain at least once instance of the term</i>
 | |
| <i>-- "fts5" (in any column). The following three queries are equivalent.</i>
 | |
| SELECT * FROM email WHERE email MATCH 'fts5';
 | |
| SELECT * FROM email WHERE email = 'fts5';
 | |
| SELECT * FROM email('fts5');
 | |
| </pre></div>
 | |
| 
 | |
| <p> By default, FTS5 full-text searches are case-independent. Like any other
 | |
| SQL query that does not contain an ORDER BY clause, the example above returns
 | |
| results in an arbitrary order. To sort results by relevance (most to least
 | |
| relevant), an ORDER BY may be added to a full-text query as follows:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Query for all rows that contain at least once instance of the term</i>
 | |
| <i>-- "fts5" (in any column). Return results in order from best to worst</i>
 | |
| <i>-- match.  </i>
 | |
| SELECT * FROM email WHERE email MATCH 'fts5' ORDER BY rank;
 | |
| </pre></div>
 | |
| 
 | |
| <p> As well as the column values and rowid of a matching row, an application
 | |
| may use <a href="fts5.html#_auxiliary_functions_">FTS5 auxiliary functions</a> to retrieve extra information regarding
 | |
| the matched row. For example, an auxiliary function may be used to retrieve
 | |
| a copy of a column value for a matched row with all instances of the matched
 | |
| term surrounded by html <b></b> tags. Auxiliary functions are
 | |
| invoked in the same way as SQLite <a href="lang_corefunc.html">scalar functions</a>, except that the name
 | |
| of the FTS5 table is specified as the first argument. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Query for rows that match "fts5". Return a copy of the "body" column</i>
 | |
| <i>-- of each row with the matches surrounded by <b></b> tags.</i>
 | |
| SELECT highlight(email, 2, '<b>', '</b>') FROM email('fts5');
 | |
| </pre></div>
 | |
| 
 | |
| <p>A description of the available auxiliary functions, and more details
 | |
| regarding configuration of the special "rank" column, are 
 | |
| <a href="fts5.html#_auxiliary_functions_">available below</a>. <a href="fts5.html#custom_auxiliary_functions">Custom auxiliary functions</a> may also be implemented in C and registered with
 | |
| FTS5, just as custom SQL functions may be registered with the SQLite core.
 | |
| 
 | |
| </p><p> As well as searching for all rows that contain a term, FTS5 allows 
 | |
| the user to search for rows that contain:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> any terms that begin with a specified prefix,
 | |
|   </li><li> "phrases" - sequences of terms or prefix terms that must feature in a
 | |
|        document for it to match the query, 
 | |
|   </li><li> sets of terms, prefix terms or phrases that appear within a specified
 | |
|        proximity of each other (these are called "NEAR queries"), or
 | |
|   </li><li> boolean combinations of any of the above.
 | |
| </li></ul>
 | |
| 
 | |
| <p> Such advanced searches are requested by providing a more complicated 
 | |
| FTS5 query string as the text to the right of the MATCH operator (or =
 | |
| operator, or as the first argument to a table-valued function syntax). The 
 | |
| full query syntax is <a href="fts5.html#full_text_query_syntax">described here</a>.
 | |
| 
 | |
| </p><a name="compiling_and_using_fts5"></a>
 | |
| <h1 tags="FTS5 building" id="compiling_and_using_fts5"><span>2. </span>Compiling and Using FTS5</h1>
 | |
| 
 | |
| <h2 id="building_fts5_as_part_of_sqlite"><span>2.1. </span>Building FTS5 as part of SQLite</h2>
 | |
| 
 | |
| <p>As of <a href="releaselog/3_9_0.html">version 3.9.0</a> (2015-10-14),
 | |
| FTS5 is included as part of the SQLite <a href="amalgamation.html">amalgamation</a>.
 | |
| If using one of the two autoconf build system, FTS5 is
 | |
| enabled by specifying the "--enable-fts5" option when running the configure
 | |
| script.  (FTS5 is currently disabled by default for the
 | |
| source-tree configure script and enabled by default for
 | |
| the amalgamation configure script, but these defaults might
 | |
| change in the future.)
 | |
| 
 | |
| </p><p>Or, if sqlite3.c is compiled using some other build system, by arranging for
 | |
| the SQLITE_ENABLE_FTS5 pre-processor symbol to be defined.
 | |
| 
 | |
| </p><h2 id="building_a_loadable_extension"><span>2.2. </span>Building a Loadable Extension</h2>
 | |
| 
 | |
| <p>Alternatively, FTS5 may be built as a loadable extension.
 | |
| 
 | |
| </p><p>The canonical FTS5 source code consists of a series of *.c and other files
 | |
| in the "ext/fts5" directory of the SQLite source tree. A build process reduces
 | |
| this to just two files - "fts5.c" and "fts5.h" - which may be used to build an
 | |
| SQLite loadable extension.
 | |
| 
 | |
| </p><ol>
 | |
|   <li> Obtain the latest SQLite code from fossil.
 | |
|   </li><li> Create a Makefile as described in <a href="howtocompile.html">How To Compile SQLite</a>.
 | |
|   </li><li> Build the "fts5.c" target. Which also creates fts5.h.
 | |
| </li></ol>
 | |
| 
 | |
| <div class="codeblock"><pre>$ wget -c http://www.sqlite.org/src/tarball/SQLite-trunk.tgz?uuid=trunk -O SQLite-trunk.tgz
 | |
| .... output ...
 | |
| $ tar -xzf SQLite-trunk.tgz
 | |
| $ cd SQLite-trunk
 | |
| $ ./configure && make fts5.c
 | |
| ... lots of output ...
 | |
| $ ls fts5.[ch]
 | |
| fts5.c        fts5.h
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
|   The code in "fts5.c" may then be compiled into a loadable extension or
 | |
|   statically linked into an application as described in 
 | |
|   <a href="loadext.html#build">Compiling Loadable Extensions</a>. There are two entry points defined, both
 | |
|   of which do the same thing:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> sqlite3_fts_init
 | |
|   </li><li> sqlite3_fts5_init
 | |
| </li></ul>
 | |
| 
 | |
| <p>
 | |
|   The other file, "fts5.h", is not required to compile the FTS5 extension. 
 | |
|   It is used by applications that implement <a href="fts5.html#extending_fts5">custom FTS5 tokenizers or auxiliary functions</a>.
 | |
| 
 | |
| </p><a name="full_text_query_syntax"></a>
 | |
| <h1 tags="FTS5 query syntax" id="full_text_query_syntax"><span>3. </span>Full-text Query Syntax</h1>
 | |
| 
 | |
| <p>
 | |
| The following block contains a summary of the FTS query syntax in BNF form.
 | |
| A detailed explanation follows.
 | |
| 
 | |
| </p><div class="codeblock"><pre><phrase>    := string [*]
 | |
| <phrase>    := <phrase> + <phrase>
 | |
| <neargroup> := NEAR ( <phrase> <phrase> ... [, N] )
 | |
| <query>     := [ [-] <colspec> :] [^] <phrase>
 | |
| <query>     := [ [-] <colspec> :] <neargroup>
 | |
| <query>     := [ [-] <colspec> :] ( <query> )
 | |
| <query>     := <query> AND <query>
 | |
| <query>     := <query> OR <query>
 | |
| <query>     := <query> NOT <query>
 | |
| <colspec>   := colname
 | |
| <colspec>   := { colname1 colname2 ... }
 | |
| </pre></div>
 | |
| 
 | |
| <a name="fts5_strings"></a>
 | |
| <h2 tags="FTS5 Strings" id="fts5_strings"><span>3.1. </span>FTS5 Strings</h2>
 | |
| <p>
 | |
| Within an FTS expression a <b>string</b> may be specified in one of two ways:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> <p>By enclosing it in double quotes ("). Within a string, any embedded
 | |
|        double quote characters may be escaped SQL-style - by adding a second
 | |
|        double-quote character.
 | |
| 
 | |
|   </p></li><li> <p>As an FTS5 bareword that is not "AND", "OR" or "NOT" (case sensitive). 
 | |
|        An FTS5 bareword is a string of one or more consecutive characters that
 | |
|        are all either:
 | |
|        
 | |
|        </p><ul>
 | |
|          <li> Non-ASCII range characters (i.e. unicode codepoints greater 
 | |
|               than 127), or 
 | |
|          </li><li> One of the 52 upper and lower case ASCII characters, or
 | |
|          </li><li> One of the 10 decimal digit ASCII characters, or
 | |
|          </li><li> The underscore character (unicode codepoint 96).
 | |
|          </li><li> The substitute character (unicode codepoint 26).
 | |
|        </li></ul>
 | |
| 
 | |
|        Strings that include any other characters must be quoted. Characters
 | |
|        that are not currently allowed in barewords, are not quote characters and
 | |
|        do not currently serve any special purpose in FTS5 query expressions may
 | |
|        at some point in the future be allowed in barewords or used to implement
 | |
|        new query functionality. This means that queries that are currently
 | |
|        syntax errors because they include such a character outside of a quoted
 | |
|        string may be interpreted differently by some future version of FTS5.
 | |
| </li></ul>
 | |
| 
 | |
| <a name="fts5_phrases"></a>
 | |
| <h2 tags="FTS5 Phrases" id="fts5_phrases"><span>3.2. </span>FTS5 Phrases</h2>
 | |
| <p>
 | |
| FTS queries are made up of <b>phrases</b>. A phrase is an ordered list of 
 | |
| one or more tokens. A string is transformed into a phrase by passing it to
 | |
| the FTS table tokenizer. Two phrases can be concatenated into a single 
 | |
| large phrase using the "+" operator. For example, assuming the tokenizer
 | |
| module being used tokenizes the input "one.two.three" to three separate
 | |
| tokens, the following four queries all specify the same phrase:
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH '"one two three"'
 | |
| ... MATCH 'one + two + three'
 | |
| ... MATCH '"one two" + three'
 | |
| ... MATCH 'one.two.three'
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
| A phrase matches a document if the document contains at least one sub-sequence
 | |
| of tokens that matches the sequence of tokens that make up the phrase.
 | |
| 
 | |
| </p><a name="fts5_prefix_queries"></a>
 | |
| <h2 tags="FTS5 prefix queries" id="fts5_prefix_queries"><span>3.3. </span>FTS5 Prefix Queries</h2>
 | |
| <p>
 | |
| If a "*" character follows a string within an FTS expression, then the final
 | |
| token extracted from the string is marked as a <b>prefix token</b>. As you
 | |
| might expect, a prefix token matches any document token of which it is a 
 | |
| prefix. For example, the first two queries in the following block will match
 | |
| any document that contains the token "one" immediately followed by the token
 | |
| "two" and then any token that begins with "thr".
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH '"one two thr" * '
 | |
| ... MATCH 'one + two + thr*'
 | |
| ... MATCH '"one two thr*"'      <b>-- May not work as expected!</b>
 | |
| </pre></div>
 | |
| 
 | |
| <p>The final query in the block above may not work as expected. Because the
 | |
| "*" character is inside the double-quotes, it will be passed to the tokenizer,
 | |
| which will likely discard it (or perhaps, depending on the specific tokenizer
 | |
| in use, include it as part of the final token) instead of recognizing it as
 | |
| a special FTS character.
 | |
| 
 | |
| <a name="carrotq"></a>
 | |
| 
 | |
| </p><a name="fts5_initial_token_queries"></a>
 | |
| <h2 tags="FTS5 initial token queries" id="fts5_initial_token_queries"><span>3.4. </span>FTS5 Initial Token Queries</h2>
 | |
| <p>
 | |
| If a "^" character appears immediately before a phrase that is not part of a
 | |
| NEAR query, then that phrase only matches a document only if it starts at the
 | |
| first token in a column. The "^" syntax may be combined with a 
 | |
| <a href="fts5.html#fts5_column_filters">column filter</a>, but may not be inserted into the middle of
 | |
| a phrase.
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH '^one'              <i>-- first token in any column must be "one"</i>
 | |
| ... MATCH '^ one + two'       <i>-- phrase "one two" must appear at start of a column</i>
 | |
| ... MATCH '^ "one two"'       <i>-- same as previous </i>
 | |
| ... MATCH 'a : ^two'          <i>-- first token of column "a" must be "two"</i>
 | |
| ... MATCH 'NEAR(^one, two)'   <b>-- syntax error! </b>
 | |
| ... MATCH 'one + ^two'        <b>-- syntax error! </b>
 | |
| ... MATCH '"^one two"'        <b>-- May not work as expected!</b>
 | |
| </pre></div>
 | |
| 
 | |
| <a name="fts5_near_queries"></a>
 | |
| <h2 tags="FTS5 NEAR queries" id="fts5_near_queries"><span>3.5. </span>FTS5 NEAR Queries</h2>
 | |
| 
 | |
| <p>Two or more phrases may be grouped into a <b>NEAR group</b>. A NEAR group
 | |
| is specified by the token "NEAR" (case sensitive) followed by an open
 | |
| parenthesis character, followed by two or more whitespace separated phrases, optionally followed by a comma and the numeric parameter <i>N</i>, followed by
 | |
| a close parenthesis. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH 'NEAR("one two" "three four", 10)'
 | |
| ... MATCH 'NEAR("one two" thr* + four)'
 | |
| </pre></div>
 | |
| 
 | |
| <p>If no <i>N</i> parameter is supplied, it defaults to 10. A NEAR group
 | |
| matches a document if the document contains at least one clump of tokens that: 
 | |
| 
 | |
| </p><ol> 
 | |
|   <li> contains at least one instance of each phrase, and 
 | |
|   </li><li> for which the number of tokens between the end of the first phrase 
 | |
|        and the beginning of the last phrase in the clump is less than or equal to <i>N</i>.
 | |
| </li></ol>
 | |
| 
 | |
| <p>For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>CREATE VIRTUAL TABLE f USING fts5(x);
 | |
| INSERT INTO f(rowid, x) VALUES(1, 'A B C D x x x E F x');
 | |
| 
 | |
| ... MATCH 'NEAR(e d, 4)';                      <i>-- Matches!</i>
 | |
| ... MATCH 'NEAR(e d, 3)';                      <i>-- Matches!</i>
 | |
| ... MATCH 'NEAR(e d, 2)';                      <i>-- Does not match!</i>
 | |
| 
 | |
| ... MATCH 'NEAR("c d" "e f", 3)';              <i>-- Matches!</i>
 | |
| ... MATCH 'NEAR("c"   "e f", 3)';              <i>-- Does not match!</i>
 | |
| 
 | |
| ... MATCH 'NEAR(a d e, 6)';                    <i>-- Matches!</i>
 | |
| ... MATCH 'NEAR(a d e, 5)';                    <i>-- Does not match!</i>
 | |
| 
 | |
| ... MATCH 'NEAR("a b c d" "b c" "e f", 4)';    <i>-- Matches!</i>
 | |
| ... MATCH 'NEAR("a b c d" "b c" "e f", 3)';    <i>-- Does not match!</i>
 | |
| </pre></div>
 | |
| 
 | |
| <a name="fts5_column_filters"></a>
 | |
| <h2 tags="FTS5 column filters" id="fts5_column_filters"><span>3.6. </span>FTS5 Column Filters</h2>
 | |
| 
 | |
| <p>
 | |
| A single phrase or NEAR group may be restricted to matching text within a
 | |
| specified column of the FTS table by prefixing it with the column name 
 | |
| followed by a colon character. Or to a set of columns by prefixing it
 | |
| with a whitespace separated list of column names enclosed in parenthesis
 | |
| ("curly brackets") followed by a colon character. Column names may be specified
 | |
| using either of the two forms described for strings above. Unlike strings that
 | |
| are part of phrases, column names are not passed to the tokenizer module.
 | |
| Column names are case-insensitive in the usual way for SQLite column names -
 | |
| upper/lower case equivalence is understood for ASCII-range characters only.
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH 'colname : NEAR("one two" "three four", 10)'
 | |
| ... MATCH '"colname" : one + two + three'
 | |
| 
 | |
| ... MATCH '{col1 col2} : NEAR("one two" "three four", 10)'
 | |
| ... MATCH '{col2 col1 col3} : one + two + three'
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
| If a column filter specification is preceded by a "-" character, then
 | |
| it is interpreted as a list of column not to match against. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Search for matches in all columns except "colname"</i>
 | |
| ... MATCH '- colname : NEAR("one two" "three four", 10)'
 | |
| 
 | |
| <i>-- Search for matches in all columns except "col1", "col2" and "col3"</i>
 | |
| ... MATCH '- {col2 col1 col3} : one + two + three'
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
| Column filter specifications may also be applied to arbitrary expressions
 | |
| enclosed in parenthesis. In this case the column filter applies to all 
 | |
| phrases within the expression. Nested column filter operations may only 
 | |
| further restrict the subset of columns matched, they can not be used to 
 | |
| re-enable filtered columns. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- The following are equivalent:</i>
 | |
| ... MATCH '{a b} : ( {b c} : "hello" AND "world" )'
 | |
| ... MATCH '(b : "hello") AND ({a b} : "world")'
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
| Finally, a column filter for a single column may be specified by using
 | |
| the column name as the LHS of a MATCH operator (instead of the usual
 | |
| table name). For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Given the following table</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, c);
 | |
| 
 | |
| <i>-- The following are equivalent</i>
 | |
| SELECT * FROM ft WHERE b MATCH 'uvw AND xyz';
 | |
| SELECT * FROM ft WHERE ft MATCH 'b : (uvw AND xyz)';
 | |
| 
 | |
| <i>-- This query cannot match any rows (since all columns are filtered out): </i>
 | |
| SELECT * FROM ft WHERE b MATCH 'a : xyz';
 | |
| </pre></div>
 | |
| 
 | |
| <a name="fts5_boolean_operators"></a>
 | |
| <h2 tags="FTS5 boolean operators" id="fts5_boolean_operators"><span>3.7. </span>FTS5 Boolean Operators</h2>
 | |
| 
 | |
| <p>
 | |
| Phrases and NEAR groups may be arranged into expressions using <b>boolean
 | |
| operators</b>. In order of precedence, from highest (tightest grouping) to
 | |
| lowest (loosest grouping), the operators are:
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
|   <tr style="text-align:left"><th>Operator </th><th>Function
 | |
| 
 | |
|   </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td><code><query1> NOT <query2></code> 
 | |
|       </td><td>Matches if query1 matches and query2 does not match.
 | |
| 
 | |
|   </td></tr><tr style="text-align:left"><td><code><query1> AND <query2></code> 
 | |
|       </td><td>Matches if both query1 and query2 match.
 | |
| 
 | |
|   </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td><code><query1> OR <query2></code> 
 | |
|       </td><td>Matches if either query1 or query2 match.
 | |
| 
 | |
| </td></tr></table>
 | |
| 
 | |
| <p>
 | |
| Parenthesis may be used to group expressions in order to modify operator
 | |
| precedence in the usual ways. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Matches documents that contain at least one instance of either "one"</i>
 | |
| <i>-- or "two", but do not contain any instances of token "three".</i>
 | |
| ... MATCH 'one OR two NOT three'
 | |
| 
 | |
| <i>-- Match all documents that contain the token "two" but not "three", or</i>
 | |
| <i>-- contain the token "one".</i>
 | |
| ... MATCH 'one OR (two NOT three)'
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
| Phrases and NEAR groups may also be connected by <b>implicit AND operators</b>.
 | |
| For simplicity, these are not shown in the BNF grammar above. Essentially, any
 | |
| sequence of phrases or NEAR groups (including those restricted to matching
 | |
| specified columns) separated only by whitespace are handled as if there were an
 | |
| implicit AND operator between each pair of phrases or NEAR groups. Implicit
 | |
| AND operators are never inserted after or before an expression enclosed in
 | |
| parenthesis. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>... MATCH 'one two three'         <i>-- 'one AND two AND three'</i>
 | |
| ... MATCH 'three "one two"'       <i>-- 'three AND "one two"'</i>
 | |
| ... MATCH 'NEAR(one two) three'   <i>-- 'NEAR(one two) AND three'</i>
 | |
| ... MATCH 'one OR two three'      <i>-- 'one OR two AND three'</i>
 | |
| 
 | |
| ... MATCH '(one OR two) three'    <i>-- Syntax error!</i>
 | |
| ... MATCH 'func(one two)'         <i>-- Syntax error!</i>
 | |
| </pre></div>
 | |
| 
 | |
| <a name="fts5_table_creation_and_initialization"></a>
 | |
| <h1 tags="FTS5 CREATE TABLE Options" id="fts5_table_creation_and_initialization"><span>4. </span>FTS5 Table Creation and Initialization</h1>
 | |
| 
 | |
| <p>Each argument specified as part of a "CREATE VIRTUAL TABLE ... USING fts5 
 | |
| ..." statement is either a column declaration or a configuration option. A
 | |
| <b>column declaration</b> consists of one or more whitespace separated FTS5
 | |
| barewords or string literals quoted in any manner acceptable to SQLite.
 | |
| 
 | |
| </p><p>The first string or bareword in a column declaration is the column name. It
 | |
| is an error to attempt to name an fts5 table column "rowid" or "rank", or to
 | |
| assign the same name to a column as is used by the table itself. This is not
 | |
| supported.
 | |
| 
 | |
| </p><p>Each subsequent string or bareword in a column declaration is a column
 | |
| option that modifies the behaviour of that column. Column options are
 | |
| case-independent. Unlike the SQLite core, FTS5 considers unrecognized column
 | |
| options to be errors. Currently, the only option recognized is 
 | |
| <a href="fts5.html#the_unindexed_column_option">"UNINDEXED" (see below)</a>.
 | |
| 
 | |
| </p><p>A <b>configuration option</b> consists of an FTS5 bareword - the option name -
 | |
| followed by an "=" character, followed by the option value. The option value is
 | |
| specified using either a single FTS5 bareword or a string literal, again quoted
 | |
| in any manner acceptable to the SQLite core. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>CREATE VIRTUAL TABLE mail USING fts5(sender, title, body, tokenize = 'porter ascii');
 | |
| </pre></div>
 | |
| 
 | |
| <p> There are currently the following configuration options:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> The "tokenize" option, used to configure a <a href="fts5.html#tokenizers">custom tokenizer</a>.
 | |
|   </li><li> The "prefix" option, used to add <a href="fts5.html#prefix_indexes">prefix indexes</a>
 | |
|        to an FTS5 table.
 | |
|   </li><li> The "content" option, used to make the FTS5 table an 
 | |
|        <a href="fts5.html#external_content_and_contentless_tables">external content or contentless table</a>.
 | |
|   </li><li> The "content_rowid" option, used to set the rowid field of an 
 | |
|        <a href="fts5.html#external_content_tables">external content table</a>.
 | |
|   </li><li> The <a href="fts5.html#the_columnsize_option">"columnsize" option</a>, used to configure
 | |
|        whether or not the size in tokens of each value in the FTS5 table is
 | |
|        stored separately within the database.
 | |
|   </li><li> The <a href="fts5.html#the_detail_option">"detail" option</a>. This option may be used 
 | |
|        to reduce the size of the FTS index on disk by omitting some information
 | |
|        from it.  
 | |
| </li></ul>
 | |
| 
 | |
| <a name="the_unindexed_column_option"></a>
 | |
| <h2 tags="unindexed" id="the_unindexed_column_option"><span>4.1. </span>The UNINDEXED column option</h2>
 | |
| 
 | |
| <p>The contents of columns qualified with the UNINDEXED column option are not
 | |
| added to the FTS index. This means that for the purposes of MATCH queries and
 | |
| <a href="fts5.html#_auxiliary_functions_">FTS5 auxiliary functions</a>, the column contains no matchable tokens. 
 | |
| 
 | |
| </p><p>For example, to avoid adding the contents of the "uuid" field to the FTS
 | |
| index:
 | |
| </p><div class="codeblock"><pre>CREATE VIRTUAL TABLE customers USING fts5(name, addr, uuid UNINDEXED);
 | |
| </pre></div>
 | |
| 
 | |
| <a name="prefix_indexes"></a>
 | |
| <h2 tags="FTS5 prefix indexes" id="prefix_indexes"><span>4.2. </span>Prefix Indexes</h2>
 | |
| 
 | |
| <p> By default, FTS5 maintains a single index recording the location of each
 | |
| token instance within the document set. This means that querying for complete
 | |
| tokens is fast, as it requires a single lookup, but querying for a prefix 
 | |
| token can be slow, as it requires a range scan. For example, to query for
 | |
| the prefix token "abc*" requires a range scan of all tokens greater than
 | |
| or equal to "abc" and less than "abd".
 | |
| 
 | |
| </p><p> A prefix index is a separate index that records the location of all
 | |
| instances of prefix tokens of a certain length in characters used to speed
 | |
| up queries for prefix tokens. For example, optimizing a query for prefix
 | |
| token "abc*" requires a prefix index of three-character prefixes.
 | |
| 
 | |
| </p><p> To add prefix indexes to an FTS5 table, the "prefix" option is set to
 | |
| either a single positive integer or a text value containing a white-space
 | |
| separated list of one or more positive integer values. A prefix index is
 | |
| created for each integer specified. If more than one "prefix" option is
 | |
| specified as part of a single CREATE VIRTUAL TABLE statement, all apply.
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Two ways to create an FTS5 table that maintains prefix indexes for
 | |
| -- two and three character prefix tokens.</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix='2 3');
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix=2, prefix=3);
 | |
| </pre></div>
 | |
| 
 | |
| <a name="tokenizers"></a>
 | |
| <h2 tags="FTS5 tokenizers" id="tokenizers"><span>4.3. </span>Tokenizers</h2>
 | |
| 
 | |
| <p> The CREATE VIRTUAL TABLE "tokenize" option is used to configure the
 | |
| specific tokenizer used by the FTS5 table. The option argument must be either
 | |
| an FTS5 bareword, or an SQL text literal. The text of the argument is itself
 | |
| treated as a white-space series of one or more FTS5 barewords or SQL text
 | |
| literals. The first of these is the name of the tokenizer to use. The second
 | |
| and subsequent list elements, if they exist, are arguments passed to the
 | |
| tokenizer implementation.
 | |
| 
 | |
| </p><p> Unlike option values and column names, SQL text literals intended as
 | |
| tokenizers must be quoted using single quote characters. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- The following are all equivalent</i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter ascii');
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "porter ascii");
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "'porter' 'ascii'");
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '''porter'' ''ascii''');
 | |
| 
 | |
| <i>-- But this will fail:</i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '"porter" "ascii"');
 | |
| 
 | |
| <i>-- This will fail too:</i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter' 'ascii');
 | |
| </pre></div>
 | |
| 
 | |
| 
 | |
| <p>
 | |
| FTS5 features three built-in tokenizer modules, described in subsequent
 | |
| sections:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> The <b>unicode61</b> tokenizer, based on the Unicode 6.1 standard. This
 | |
|        is the default.
 | |
| 
 | |
|   </li><li> The <b>ascii</b> tokenizer, which assumes all characters outside of
 | |
|   the ASCII codepoint range (0-127) are to be treated as token characters.
 | |
| 
 | |
|   </li><li> The <b>porter</b> tokenizer, which implements the 
 | |
| <a href="http://tartarus.org/martin/PorterStemmer/">porter stemming algorithm</a>.
 | |
| </li></ul>
 | |
| 
 | |
| <p> It is also possible to create custom tokenizers for FTS5. The API for doing so is <a href="fts5.html#custom_tokenizers">described here</a>.
 | |
| 
 | |
| </p><h3 id="unicode61_tokenizer"><span>4.3.1. </span>Unicode61 Tokenizer</h3>
 | |
| 
 | |
| <p> The unicode tokenizer classifies all unicode characters as either 
 | |
| "separator" or "token" characters. By default all space and punctuation
 | |
| characters, as defined by Unicode 6.1, are considered separators, and all 
 | |
| other characters as token characters. More specifically, all unicode 
 | |
| characters assigned to a 
 | |
| <a href="https://en.wikipedia.org/wiki/Unicode_character_property#General_Category">
 | |
| general category</a> beginning with "L" or "N" (letters and numbers,
 | |
| specifically) or to category "Co" ("other, private use") are considered tokens.
 | |
| All other characters are separators.
 | |
|  
 | |
| </p><p>Each contiguous run of one or more token characters is considered to be a
 | |
| token. The tokenizer is case-insensitive according to the rules defined by
 | |
| Unicode 6.1.
 | |
| 
 | |
| </p><p> By default, diacritics are removed from all Latin script characters. This
 | |
| means, for example, that "A", "a", "À", "à", "Â" and "â"
 | |
| are all considered to be equivalent.
 | |
| 
 | |
| </p><p> Any arguments following "unicode61" in the token specification are treated
 | |
| as a list of alternating option names and values. Unicode61 supports the
 | |
| following options:
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
|   <tr style="text-align:left"><th> Option </th><th> Usage
 | |
|   </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td> remove_diacritics
 | |
|   </td><td>This option should be set to "0", "1" or "2". The default value is "1".
 | |
|   If it is set to "1" or "2", then diacritics are removed from Latin script
 | |
|   characters as described above. However, if it is set to "1", then diacritics
 | |
|   are not removed in the fairly uncommon case where a single unicode codepoint
 | |
|   is used to represent a character with more that one diacritic. For example,
 | |
|   diacritics are not removed from codepoint 0x1ED9 ("LATIN SMALL LETTER O WITH
 | |
|   CIRCUMFLEX AND DOT BELOW"). This is technically a bug, but cannot be fixed
 | |
|   without creating backwards compatibility problems. If this option is set to
 | |
|   "2", then diacritics are correctly removed from all Latin characters.
 | |
| 
 | |
|   </td></tr><tr style="text-align:left"><td> categories
 | |
|   </td><td>This option may be used to modify the set of Unicode general categories
 | |
|   that are considered to correspond to token characters. The argument must
 | |
|   consist of a space separated list of two-character general category
 | |
|   abbreviations (e.g. "Lu" or "Nd"), or of the same with the second character
 | |
|   replaced with an asterisk ("*"), interpreted as a glob pattern. The default
 | |
|   value is "L* N* Co".
 | |
| 
 | |
|   </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td> tokenchars
 | |
|   </td><td> This option is used to specify additional unicode characters that 
 | |
|   should be considered token characters, even if they are white-space or
 | |
|   punctuation characters according to Unicode 6.1. All characters in the
 | |
|   string that this option is set to are considered token characters.
 | |
| 
 | |
|   </td></tr><tr style="text-align:left"><td> separators
 | |
|   </td><td> This option is used to specify additional unicode characters that 
 | |
|   should be considered as separator characters, even if they are token
 | |
|   characters according to Unicode 6.1. All characters in the string that 
 | |
|   this option is set to are considered separators.
 | |
| </td></tr></table>
 | |
| 
 | |
| <p> For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create an FTS5 table that does not remove diacritics from Latin
 | |
| -- script characters, and that considers hyphens and underscore characters
 | |
| -- to be part of tokens. </i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, 
 | |
|     tokenize = "unicode61 remove_diacritics 0 tokenchars '-_'"
 | |
| );
 | |
| </pre></div>
 | |
| 
 | |
| <p> or:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create an FTS5 table that, as well as the default token character classes,</i>
 | |
| <i>-- considers characters in class "Mn" to be token characters.</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, 
 | |
|     tokenize = "unicode61 categories 'L* N* Co Mn'"
 | |
| );
 | |
| </pre></div>
 | |
| 
 | |
| <p> The fts5 unicode61 tokenizer is byte-for-byte compatible with the fts3/4
 | |
| unicode61 tokenizer.
 | |
| 
 | |
| </p><h3 id="ascii_tokenizer"><span>4.3.2. </span>Ascii Tokenizer</h3>
 | |
| 
 | |
| <p> The Ascii tokenizer is similar to the Unicode61 tokenizer, except that:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> All non-ASCII characters (those with codepoints greater than 127) are
 | |
|   always considered token characters. If any non-ASCII characters are specified
 | |
|   as part of the separators option, they are ignored.  
 | |
| 
 | |
|   </li><li> Case-folding is only performed for ASCII characters. So while "A" and
 | |
|   "a" are considered to be equivalent, "Ã" and "ã" are distinct.
 | |
| 
 | |
|   </li><li> The remove_diacritics option is not supported.
 | |
| </li></ul>
 | |
| 
 | |
| <p> For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create an FTS5 table that uses the ascii tokenizer, but does not
 | |
| -- consider numeric characters to be part of tokens.</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, 
 | |
|     tokenize = "ascii separators '0123456789'"
 | |
| );
 | |
| </pre></div>
 | |
| 
 | |
| <h3 id="porter_tokenizer"><span>4.3.3. </span>Porter Tokenizer</h3>
 | |
| 
 | |
| <p> The porter tokenizer is a wrapper tokenizer. It takes the output of some
 | |
| other tokenizer and applies the 
 | |
| <a href="http://tartarus.org/martin/PorterStemmer/">porter stemming algorithm</a>
 | |
| to each token before it returns it to FTS5. This allows search terms like
 | |
| "correction" to match similar words such as "corrected" or "correcting". The
 | |
| porter stemmer algorithm is designed for use with English language terms 
 | |
| only - using it with other languages may or may not improve search utility.
 | |
| 
 | |
| </p><p> By default, the porter tokenizer operates as a wrapper around the default
 | |
| tokenizer (unicode61). Or, if one or more extra arguments are added to the
 | |
| "tokenize" option following "porter", they are treated as a specification for
 | |
| the underlying tokenizer that the porter stemmer uses. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Two ways to create an FTS5 table that uses the porter tokenizer to
 | |
| -- stem the output of the default tokenizer (unicode61). </i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = porter); 
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61');
 | |
| 
 | |
| <i>-- A porter tokenizer used to stem the output of the unicode61 tokenizer,
 | |
| -- with diacritics removed before stemming.</i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61 remove_diacritics 1');
 | |
| </pre></div>
 | |
| 
 | |
| <a name="external_content_and_contentless_tables"></a>
 | |
| <h2 tags="FTS5 content option" id="external_content_and_contentless_tables"><span>4.4. </span>External Content and Contentless Tables</h2>
 | |
| 
 | |
| <p>
 | |
| Normally, when a row is inserted into an FTS5 table, as well as the various
 | |
| full-text index entries and other data a copy of the row is stored in a private
 | |
| table managed by the FTS5 module. When column values are requested from the
 | |
| FTS5 table by the user or by an auxiliary function implementation, they are
 | |
| read from this private table. The "content" option may be used to create an
 | |
| FTS5 table that stores only FTS full-text index entries. Because the column
 | |
| values themselves are usually much larger than the associated full-text index
 | |
| entries, this can save significant database space.
 | |
| 
 | |
| </p><p>
 | |
| There are two ways to use the "content" option:
 | |
| </p><ul>
 | |
|   <li> By setting it to an empty string to create a contentless FTS5 table. In
 | |
|        this case FTS5 assumes that the original column values are unavailable
 | |
|        to it when processing queries. Full-text queries and some auxiliary
 | |
|        functions can still be used, but no column values apart from the rowid
 | |
|        may be read from the table.
 | |
| 
 | |
|   </li><li> By setting it to the name of a database object (table, virtual table or
 | |
|        view) that may be queried by FTS5 at any time to retrieve the column
 | |
|        values. This is known as an "external content" table. In this case all
 | |
|        FTS5 functionality may be used, but it is the responsibility of the user
 | |
|        to ensure that the contents of the full-text index are consistent with
 | |
|        the named database object. If they are not, query results may be
 | |
|        unpredictable.  
 | |
| </li></ul>
 | |
| 
 | |
| <a name="contentless_tables"></a>
 | |
| <h3 tags="FTS5 contentless tables" id="contentless_tables"><span>4.4.1. </span>Contentless Tables</h3>
 | |
| 
 | |
| <p> A contentless FTS5 table is created by setting the "content" option to
 | |
| an empty string. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>CREATE VIRTUAL TABLE f1 USING fts5(a, b, c, content='');
 | |
| </pre></div>
 | |
| 
 | |
| <p> Contentless FTS5 tables do not support UPDATE or DELETE statements, or
 | |
| INSERT statements that do not supply a non-NULL value for the rowid field.
 | |
| Contentless tables do not support REPLACE conflict handling. REPLACE 
 | |
| and INSERT OR REPLACE statements are treated as regular INSERT statements.
 | |
| Rows may be deleted from a contentless table using an <a href="fts5.html#the_delete_command">FTS5 delete command</a>.
 | |
| 
 | |
| </p><p> Attempting to read any column value except the rowid from a contentless
 | |
| FTS5 table returns an SQL NULL value.
 | |
| 
 | |
| </p><a name="external_content_tables"></a>
 | |
| <h3 tags="FTS5 external content tables" id="external_content_tables"><span>4.4.2. </span>External Content Tables</h3>
 | |
| 
 | |
| <p> An external content FTS5 table is created by setting the content 
 | |
| option to the name of a table, virtual table or view (hereafter the "content
 | |
| table") within the same database. Whenever column values are required by
 | |
| FTS5, it queries the content table as follows, with the rowid of the row
 | |
| for which values are required bound to the SQL variable:
 | |
| 
 | |
| </p><div class="codeblock"><pre>SELECT <content_rowid>, <cols> FROM <content> WHERE <content_rowid> = ?;
 | |
| </pre></div>
 | |
| 
 | |
| <p> In the above, <content> is replaced by the name of the content table.
 | |
| By default, <content_rowid> is replaced by the literal text "rowid". Or,
 | |
| if the "content_rowid" option is set within the CREATE VIRTUAL TABLE statement,
 | |
| by the value of that option. <cols> is replaced by a comma-separated list
 | |
| of the FTS5 table column names. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- If the database schema is: </i>
 | |
| CREATE TABLE tbl (a, b, c, d INTEGER PRIMARY KEY);
 | |
| CREATE VIRTUAL TABLE fts USING fts5(a, c, content=tbl, content_rowid=d);
 | |
| 
 | |
| <i>-- Fts5 may issue queries such as:</i>
 | |
| SELECT d, a, c FROM tbl WHERE d = ?;
 | |
| </pre></div>
 | |
| 
 | |
| <p> The content table may also be queried as follows:
 | |
| 
 | |
| </p><div class="codeblock"><pre>SELECT <content_rowid>, <cols> FROM <content> ORDER BY <content_rowid> ASC;
 | |
| SELECT <content_rowid>, <cols> FROM <content> ORDER BY <content_rowid> DESC;
 | |
| </pre></div>
 | |
| 
 | |
| <p> It is still the responsibility of the user to ensure that the contents of
 | |
| an external content FTS5 table are kept up to date with the content table. 
 | |
| One way to do this is with triggers. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create a table. And an external content fts5 table to index it.</i>
 | |
| CREATE TABLE tbl(a INTEGER PRIMARY KEY, b, c);
 | |
| CREATE VIRTUAL TABLE fts_idx USING fts5(b, c, content='tbl', content_rowid='a');
 | |
| 
 | |
| <i>-- Triggers to keep the FTS index up to date.</i>
 | |
| CREATE TRIGGER tbl_ai AFTER INSERT ON tbl BEGIN
 | |
|   INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c);
 | |
| END;
 | |
| CREATE TRIGGER tbl_ad AFTER DELETE ON tbl BEGIN
 | |
|   INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c);
 | |
| END;
 | |
| CREATE TRIGGER tbl_au AFTER UPDATE ON tbl BEGIN
 | |
|   INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c);
 | |
|   INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c);
 | |
| END;
 | |
| </pre></div>
 | |
| 
 | |
| <p> Like contentless tables, external content tables do not support REPLACE
 | |
| conflict handling. Any operations that specify REPLACE conflict handling are
 | |
| handled using ABORT.
 | |
| 
 | |
| </p><a name="the_columnsize_option"></a>
 | |
| <h2 tags="FTS5 columnsize option" id="the_columnsize_option"><span>4.5. </span>The Columnsize Option</h2>
 | |
| 
 | |
| <p>Normally, FTS5 maintains a special backing table within the database that
 | |
| stores the size of each column value in tokens inserted into the main FTS5
 | |
| table in a separate table. This backing table is used by the
 | |
| <a href="#xColumnSize">xColumnSize</a><a> API function, which is in turn used by
 | |
| the built-in <a href="fts5.html#the_bm25_function">bm25 ranking function</a> (and is likely to be useful 
 | |
| to other ranking functions as well).
 | |
| 
 | |
| </a></p><p>In order to save space, this backing table may be omitted by setting the
 | |
| columnsize option to zero. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- A table without the xColumnSize() values stored on disk:</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, c, columnsize=0);
 | |
| 
 | |
| <i>-- Three equivalent ways of creating a table that does store the</i>
 | |
| <i>-- xColumnSize() values on disk:</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, c);
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, c, columnsize=1);
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b, columnsize='1', c);
 | |
| </pre></div>
 | |
| 
 | |
| <p> It is an error to set the columnsize option to any value other than
 | |
| 0 or 1.
 | |
| 
 | |
| </p><p> If an FTS5 table is configured with columnsize=0 but is not a
 | |
| <a href="fts5.html#contentless_tables">contentless table</a>, the xColumnSize API function
 | |
| still works, but runs much more slowly. In this case, instead of reading
 | |
| the value to return directly from the database, it reads the text value
 | |
| itself and count the tokens within it on demand.
 | |
| 
 | |
| </p><p>Or, if the table is also a <a href="fts5.html#contentless_tables">contentless table</a>,
 | |
| then the following apply:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> <p>The xColumnSize API always returns -1. There is no way to determine 
 | |
|        the number of tokens in a value stored within a contentless FTS5 table
 | |
|        configured with columnsize=0.
 | |
| 
 | |
|   </p></li><li> <p>Each inserted row must be accompanied by an explicitly specified rowid
 | |
|        value. If a contentless table is configured with columnsize=0,
 | |
|        attempting to insert a NULL value into the rowid is an SQLITE_MISMATCH
 | |
|        error.
 | |
| 
 | |
|   </p></li><li> <p>All queries on the table must be full-text queries. In other words,
 | |
|        they must use the MATCH or = operator with the table-name column as the
 | |
|        left-hand operand, or else use the table-valued function syntax. Any
 | |
|        query that is not a full-text query results in an error.
 | |
| </p></li></ul>
 | |
| 
 | |
| <p> The name of the table in which the xColumnSize values are stored
 | |
| (unless columnsize=0 is specified) is "<name>_docsize", where 
 | |
| <name> is the name of the FTS5 table itself. The 
 | |
| <a href="https://www.sqlite.org/download.html">sqlite3_analyzer</a>
 | |
| tool may be used on an existing database in order to determine how much
 | |
| space might be saved by recreating an FTS5 table using columnsize=0.
 | |
| 
 | |
| </p><a name="the_detail_option"></a>
 | |
| <h2 tags="FTS5 detail option" id="the_detail_option"><span>4.6. </span>The Detail Option</h2>
 | |
| 
 | |
| <p> For each term in a document, the FTS index maintained by FTS5 
 | |
| stores the rowid of the document, the column number of the column that contains
 | |
| the term and the offset of the term within the column value. The "detail"
 | |
| option may be used to omit some of this information. This reduces the space
 | |
| that the index consumes within the database file, but also reduces the
 | |
| capability and efficiency of the system.
 | |
| 
 | |
| </p><p> The detail option may be set to "full" (the default value), "column" or
 | |
| "none". For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- The following two lines are equivalent (because the default value</i>
 | |
| <i>-- of "detail" is "full". </i>
 | |
| CREATE VIRTUAL TABLE ft1 USING fts5(a, b, c);
 | |
| CREATE VIRTUAL TABLE ft1 USING fts5(a, b, c, detail=full);
 | |
| 
 | |
| CREATE VIRTUAL TABLE ft2 USING fts5(a, b, c, detail=column);
 | |
| CREATE VIRTUAL TABLE ft3 USING fts5(a, b, c, detail=none);
 | |
| </pre></div>
 | |
| 
 | |
| <p>If the detail option is set to <b>column</b>, then for each term the FTS
 | |
| index records the rowid and column number only, omitting the term offset
 | |
| information. This results in the following restrictions:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> NEAR queries are not available.
 | |
|   </li><li> Phrase queries are not available.
 | |
|   </li><li> Assuming the table is not also a 
 | |
|   <a href="fts5.html#contentless_tables">contentless table</a>, the 
 | |
|   <a href="#xInstCount">xInstCount</a>, <a href="#xInst">xInst</a>, 
 | |
|   <a href="#xPhraseFirst">xPhraseFirst</a> and <a href="#xPhraseNext">xPhraseNext</a>
 | |
|   are slower than usual. This is because instead of reading the required data
 | |
|   directly from the FTS index they have to load and tokenize the document text 
 | |
|   on demand.
 | |
|   </li><li> If the table is also a contentless table, the xInstCount, xInst, 
 | |
|   xPhraseFirst and xPhraseNext APIs behave as if the current row contains no
 | |
|   phrase matches at all (i.e. xInstCount() returns 0).
 | |
| </li></ul>
 | |
|   
 | |
| <p>If the detail option is set to <b>none</b>, then for each term the FTS
 | |
| index records just the rowid is stored. Both column and offset information
 | |
| are omitted. As well as the restrictions itemized above for detail=column
 | |
| mode, this imposes the following extra limitations:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> Column filter queries are not available.
 | |
|   </li><li> Assuming the table is not also a contentless table, the 
 | |
|   <a href="#xPhraseFirstColumn">xPhraseFirstColumn</a> and 
 | |
|   <a href="#xPhraseNextColumn">xPhraseNextColumn</a> are slower than usual. 
 | |
| 
 | |
|   </li><li> If the table is also a contentless table, the xPhraseFirstColumn and
 | |
|   xPhraseNextColumn APIs behave as if the current row contains no phrase
 | |
|   matches at all (i.e. xPhraseFirstColumn() sets the iterator to EOF).
 | |
| </li></ul>
 | |
| 
 | |
| <p> In one test that indexed a large set of emails (1636 MiB on disk), the FTS
 | |
| index was 743 MiB on disk with detail=full, 340 MiB with detail=column and 134
 | |
| MiB with detail=none.
 | |
| 
 | |
| </p><a name="_auxiliary_functions_"></a>
 | |
| <h1 tags="FTS5 auxiliary functions" id="_auxiliary_functions_"><span>5. </span> Auxiliary Functions </h1>
 | |
| 
 | |
| <p> Auxiliary functions are similar to <a href="lang_corefunc.html">SQL scalar functions</a>,
 | |
| except that they may only be used within full-text queries (those that use
 | |
| the MATCH operator) on an FTS5 table. Their results are calculated based not
 | |
| only on the arguments passed to them, but also on the current match and 
 | |
| matched row. For example, an auxiliary function may return a numeric value
 | |
| indicating the accuracy of the match (see the <a href="fts5.html#the_bm25_function">bm25()</a> function), 
 | |
| or a fragment of text from the matched row that contains one or more 
 | |
| instances of the search terms (see the <a href="fts5.html#the_snippet_function">snippet()</a> function).
 | |
| 
 | |
| </p><p>To invoke an auxiliary function, the name of the FTS5 table should be
 | |
| specified as the first argument. Other arguments may follow the first,
 | |
| depending on the specific auxiliary function being invoked. For example, to
 | |
| invoke the "highlight" function:
 | |
| 
 | |
| </p><div class="codeblock"><pre>SELECT highlight(email, 2, '<b>', '</b>') FROM email WHERE email MATCH 'fts5'
 | |
| </pre></div>
 | |
| 
 | |
| <p>The built-in auxiliary functions provided as part of FTS5 are described in
 | |
| the following section. Applications may also implement 
 | |
| <a href="fts5.html#custom_auxiliary_functions">custom auxiliary functions in C</a>.
 | |
| 
 | |
| </p><h2 id="built_in_auxiliary_functions"><span>5.1. </span>Built-in Auxiliary Functions</h2>
 | |
| 
 | |
| <p> FTS5 provides three built-in auxiliary functions:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> The <a href="fts5.html#the_bm25_function">bm25() auxiliary function</a> returns a real value
 | |
|        reflecting the accuracy of the current match. Better matches are
 | |
|        assigned numerically lower values.
 | |
| 
 | |
|   </li><li> The <a href="fts5.html#the_highlight_function">highlight() auxiliary function</a> returns a copy
 | |
|        of the text from one of the columns of the current match with each
 | |
|        instance of a queried term within the result surrounded by specified
 | |
|        markup (for example "<b>" and "</b>").
 | |
| 
 | |
|   </li><li> The <a href="fts5.html#the_snippet_function">snippet() auxiliary function</a> selects a short
 | |
|        fragment of text from one of the columns of the matched row and returns
 | |
|        it with each instance of a queried term surrounded by markup in
 | |
|        the same manner as the highlight() function. The fragment of text is
 | |
|        selected so as to maximize the number of queried terms it contains.
 | |
| </li></ul>
 | |
| 
 | |
| <a name="the_bm25_function"></a>
 | |
| <h3 tags="FTS5 bm25" id="the_bm25_function"><span>5.1.1. </span>The bm25() function</h3>
 | |
| 
 | |
| <p> The built-in auxiliary function bm25() returns a real value indicating
 | |
| how well the current row matches the full-text query. The better the match,
 | |
| the numerically smaller the value returned. A query such as the following may
 | |
| be used to return matches in order from best to worst match:
 | |
| 
 | |
| </p><div class="codeblock"><pre>SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts)
 | |
| </pre></div>
 | |
| 
 | |
| <p> In order to calculate a documents score, the full-text query is separated
 | |
|     into its component phrases. The bm25 score for document <i>D</i> and 
 | |
|     query <i>Q</i> is then calculated as follows:
 | |
| 
 | |
| </p><p> <img src="images/fts5_formula1.png" style="width:75ex;margin-left:5ex">
 | |
| 
 | |
| </p><p> In the above, <i>nPhrase</i> is the number of phrases in the query.
 | |
|     <i>|D|</i> is the number of tokens in the current document, and
 | |
|     <i>avgdl</i> is the average number of tokens in all documents within the
 | |
|     FTS5 table.  <i>k<sub>1</sub></i> and <i>b</i> are both constants,
 | |
|     hard-coded at 1.2 and 0.75 respectively.
 | |
| 
 | |
| </p><p> The "-1" term at the start of the formula is not found in most
 | |
| implementations of the BM25 algorithm. Without it, a better match is assigned
 | |
| a numerically higher BM25 score. Since the default sorting order is
 | |
| "ascending", this means that appending "ORDER BY bm25(fts)" to a query would
 | |
| cause results to be returned in order from worst to best. The "DESC" keyword
 | |
| would be required in order to return the best matches first. In order to
 | |
| avoid this pitfall, the FTS5 implementation of BM25 multiplies the result
 | |
| by -1 before returning it, ensuring that better matches are assigned
 | |
| numerically lower scores.
 | |
| 
 | |
| </p><p> <i>IDF(q<sub>i</sub>)</i> is the inverse-document-frequency of query 
 | |
|     phrase <i>i</i>. It is calculated as follows, where <i>N</i> is the total
 | |
|     number of rows in the FTS5 table and <i>n(q<sub>i</sub>)</i> is the total
 | |
|     number of rows that contain at least one instance of phrase <i>i</i>:
 | |
| 
 | |
| </p><p> <img src="images/fts5_formula2.png" style="width:75ex;margin-left:5ex">
 | |
| 
 | |
| </p><p> Finally, <i>f(q<sub>i</sub>,D)</i> is the phrase frequency of phrase 
 | |
| <i>i</i>. By default, this is simply the number of occurrences of the phrase
 | |
| within the current row. However, by passing extra real value arguments to 
 | |
| the bm25() SQL function, each column of the table may be assigned a different
 | |
| weight and the phrase frequency calculated as follows:
 | |
| 
 | |
| </p><p> <img src="images/fts5_formula3.png" style="width:75ex;margin-left:5ex">
 | |
| 
 | |
| </p><p> where <i>w<sub>c</sub></i> is the weight assigned to column <i>c</i> and
 | |
| <i>n(q<sub>i</sub>,c)</i> is the number of occurrences of phrase <i>i</i> in
 | |
| column <i>c</i> of the current row. The first argument passed to bm25()
 | |
| following the table name is the weight assigned to the leftmost column of
 | |
| the FTS5 table. The second is the weight assigned to the second leftmost
 | |
| column, and so on. If there are not enough arguments for all table columns,
 | |
| remaining columns are assigned a weight of 1.0. If there are too many 
 | |
| trailing arguments, the extras are ignored. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Assuming the following schema:</i>
 | |
| CREATE VIRTUAL TABLE email USING fts5(sender, title, body);
 | |
| 
 | |
| <i>-- Return results in bm25 order, with each phrase hit in the "sender"</i>
 | |
| <i>-- column considered the equal of 10 hits in the "body" column, and</i>
 | |
| <i>-- each hit in the "title" column considered as valuable as 5 hits in</i>
 | |
| <i>-- the "body" column.</i>
 | |
| SELECT * FROM email WHERE email MATCH ? ORDER BY bm25(email, 10.0, 5.0);
 | |
| </pre></div>
 | |
| 
 | |
| <p>Refer to wikipedia for 
 | |
| <a href="http://en.wikipedia.org/wiki/Okapi_BM25">more information regarding
 | |
| BM25</a> and its variants.
 | |
| 
 | |
| </p><a name="the_highlight_function"></a>
 | |
| <h3 tags="FTS5 highlight" id="the_highlight_function"><span>5.1.2. </span>The highlight() function</h3>
 | |
| 
 | |
| <p> The highlight() function returns a copy of the text from a specified 
 | |
| column of the current row with extra markup text inserted to mark the start 
 | |
| and end of phrase matches. 
 | |
| 
 | |
| </p><p>The highlight() must be invoked with exactly three arguments following 
 | |
| the table name. To be interpreted as follows:
 | |
| 
 | |
| </p><ol>
 | |
|   <li> An integer indicating the index of the FTS table column to read the 
 | |
|        text from. Columns are numbered from left to right starting at zero.
 | |
| 
 | |
|   </li><li> The text to insert before each phrase match.
 | |
| 
 | |
|   </li><li> The text to insert after each phrase match.
 | |
| </li></ol>
 | |
| 
 | |
| <p>For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Return a copy of the text from the leftmost column of the current</i>
 | |
| <i>-- row, with phrase matches marked using html "b" tags.</i>
 | |
| SELECT highlight(fts, 0, '<b>', '</b>') FROM fts WHERE fts MATCH ?
 | |
| </pre></div>
 | |
| 
 | |
| <p>In cases where two or more phrase instances overlap (share one or more
 | |
| tokens in common), a single open and close marker is inserted for each set
 | |
| of overlapping phrases. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Assuming this:</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a);
 | |
| INSERT INTO ft VALUES('a b c x c d e');
 | |
| INSERT INTO ft VALUES('a b c c d e');
 | |
| INSERT INTO ft VALUES('a b c d e');
 | |
| 
 | |
| <i>-- The following SELECT statement returns these three rows:</i>
 | |
| <i>--   '[a b c] x [c d e]'</i>
 | |
| <i>--   '[a b c] [c d e]'</i>
 | |
| <i>--   '[a b c d e]'</i>
 | |
| SELECT highlight(ft, 0, '[', ']') FROM ft WHERE ft MATCH 'a+b+c AND c+d+e';
 | |
| </pre></div>
 | |
| 
 | |
| <a name="the_snippet_function"></a>
 | |
| <h3 tags="FTS5 snippet" id="the_snippet_function"><span>5.1.3. </span>The snippet() function</h3>
 | |
| 
 | |
| <p>The snippet() function is similar to highlight(), except that instead of
 | |
| returning entire column values, it automatically selects and extracts a
 | |
| short fragment of document text to process and return. The snippet() function
 | |
| must be passed five parameters following the table name argument:
 | |
| 
 | |
| </p><ol>
 | |
|   <li> An integer indicating the index of the FTS table column to select
 | |
|        the returned text from. Columns are numbered from left to right 
 | |
|        starting at zero. A negative value indicates that the column should
 | |
|        be automatically selected.
 | |
| 
 | |
|   </li><li> The text to insert before each phrase match within the returned text.
 | |
| 
 | |
|   </li><li> The text to insert after each phrase match within the returned text.
 | |
| 
 | |
|   </li><li> The text to add to the start or end of the selected text to indicate
 | |
|        that the returned text does not occur at the start or end of its column,
 | |
|        respectively.
 | |
| 
 | |
|   </li><li> The maximum number of tokens in the returned text. This must be greater
 | |
|        than zero and equal to or less than 64. 
 | |
| </li></ol>
 | |
| 
 | |
| <a name="sorting_by_auxiliary_function_results"></a>
 | |
| <h2 tags="auxiliary function mapping" id="sorting_by_auxiliary_function_results"><span>5.2. </span>Sorting by Auxiliary Function Results</h2>
 | |
| 
 | |
| <p> All FTS5 tables feature a special hidden column named "rank". If the
 | |
| current query is not a full-text query (i.e. if it does not include a MATCH
 | |
| operator), the value of the "rank" column is always NULL. Otherwise, in a
 | |
| full-text query, column rank contains by default the same value as would be
 | |
| returned by executing the bm25() auxiliary function with no trailing 
 | |
| arguments.
 | |
| 
 | |
| </p><p> The difference between reading from the rank column and using the bm25()
 | |
| function directly within the query is only significant when sorting by the
 | |
| returned value. In this case, using "rank" is faster than using bm25().
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- The following queries are logically equivalent. But the second may</i>
 | |
| <i>-- be faster, particularly if the caller abandons the query before</i>
 | |
| <i>-- all rows have been returned (or if the queries were modified to </i>
 | |
| <i>-- include LIMIT clauses).</i>
 | |
| SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts);
 | |
| SELECT * FROM fts WHERE fts MATCH ? ORDER BY rank;
 | |
| </pre></div>
 | |
| 
 | |
| <p> Instead of using bm25() with no trailing arguments, the specific auxiliary
 | |
| function mapped to the rank column may be configured either on a per-query
 | |
| basis, or by setting a different persistent default for the FTS table.
 | |
| 
 | |
| </p><p> In order to change the mapping of the rank column for a single query, 
 | |
| a term similar to either of the following is added to the WHERE clause of a 
 | |
| query:
 | |
| 
 | |
| </p><div class="codeblock"><pre>rank MATCH 'auxiliary-function-name(arg1, arg2, ...)'
 | |
| rank = 'auxiliary-function-name(arg1, arg2, ...)'
 | |
| </pre></div>
 | |
| 
 | |
| <p> The right-hand-side of the MATCH or = operator must be a constant
 | |
| expression that evaluates to a string consisting of the auxiliary function to
 | |
| invoke, followed by zero or more comma separated arguments within parenthesis.
 | |
| Arguments must be SQL literals. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- The following queries are logically equivalent. But the second may</i>
 | |
| <i>-- be faster. See above. </i>
 | |
| SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts, 10.0, 5.0);
 | |
| SELECT * FROM fts WHERE fts MATCH ? AND rank MATCH 'bm25(10.0, 5.0)' ORDER BY rank;
 | |
| </pre></div>
 | |
| 
 | |
| <p> The table-valued function syntax may also be used to specify an alternative
 | |
| ranking function. In this case the text describing the ranking function should
 | |
| be specified as the second table-valued function argument. The following three
 | |
| queries are equivalent:
 | |
| 
 | |
| </p><div class="codeblock"><pre>SELECT * FROM fts WHERE fts MATCH ? AND rank MATCH 'bm25(10.0, 5.0)' ORDER BY rank;
 | |
| SELECT * FROM fts WHERE fts = ? AND rank = 'bm25(10.0, 5.0)' ORDER BY rank;
 | |
| SELECT * FROM fts WHERE fts(?, 'bm25(10.0, 5.0)') ORDER BY rank;
 | |
| </pre></div>
 | |
| 
 | |
| <p> The default mapping of the rank column for a table may be modified 
 | |
| using the <a href="fts5.html#the_rank_configuration_option">FTS5 rank configuration option</a>.
 | |
| 
 | |
| </p><h1 id="special_insert_commands"><span>6. </span>Special INSERT Commands</h1>
 | |
| 
 | |
| <a name="the_automerge_configuration_option"></a>
 | |
| <h2 tags="FTS5 automerge option" id="the_automerge_configuration_option"><span>6.1. </span>The 'automerge' Configuration Option</h2>
 | |
| 
 | |
| <p>
 | |
|   Instead of using a single data structure on disk to store the full-text
 | |
|   index, FTS5 uses a series of b-trees. Each time a new transaction is
 | |
|   committed, a new b-tree containing the contents of the committed transaction
 | |
|   is written into the database file. When the full-text index is queried, each
 | |
|   b-tree must be queried individually and the results merged before being
 | |
|   returned to the user.
 | |
| 
 | |
| </p><p>
 | |
|   In order to prevent the number of b-trees in the database from becoming too
 | |
|   large (slowing down queries), smaller b-trees are periodically merged into
 | |
|   single larger b-trees containing the same data. By default, this happens
 | |
|   automatically within INSERT, UPDATE or DELETE statements that modify the
 | |
|   full-text index. The 'automerge' parameter determines how many smaller
 | |
|   b-trees are merged together at a time. Setting it to a small value can
 | |
|   speed up queries (as they have to query and merge the results from fewer 
 | |
|   b-trees), but can also slow down writing to the database (as each INSERT,
 | |
|   UPDATE or DELETE statement has to do more work as part of the automatic
 | |
|   merging process).
 | |
| 
 | |
| </p><p>
 | |
|   Each of the b-trees that make up the full-text index is assigned to a "level"
 | |
|   based on its size. Level-0 b-trees are the smallest, as they contain the
 | |
|   contents of a single transaction. Higher level b-trees are the result of
 | |
|   merging two or more level-0 b-trees together and so they are larger. FTS5
 | |
|   begins to merge b-trees together once there exist <i>M</i> or more b-trees 
 | |
|   with the same level, where <i>M</i> is the value of the 'automerge' 
 | |
|   parameter.
 | |
| 
 | |
| </p><p>
 | |
|   The maximum allowed value for the 'automerge' parameter is 16. The default
 | |
|   value is 4. Setting the 'automerge' parameter to 0 disables the automatic 
 | |
|   incremental merging of b-trees altogether.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('automerge', 8);
 | |
| </pre></div>
 | |
| 
 | |
| <h2 id="the_crisismerge_configuration_option"><span>6.2. </span>The 'crisismerge' Configuration Option</h2>
 | |
| 
 | |
| <p>The 'crisismerge' option is similar to 'automerge', in that it determines
 | |
| how and how often the component b-trees that make up the full-text index are
 | |
| merged together. Once there exist <i>C</i> or more b-trees on a single level
 | |
| within the full-text index, where <i>C</i> is the value of the 'crisismerge'
 | |
| option, all b-trees on the level are immediately merged into a single b-tree.
 | |
| 
 | |
| </p><p>The difference between this option and the 'automerge' option is that when
 | |
| the 'automerge' limit is reached FTS5 only begins to merge the b-trees
 | |
| together. Most of the work is performed as part of subsequent INSERT, 
 | |
| UPDATE or DELETE operations. Whereas when the 'crisismerge' limit is reached,
 | |
| the offending b-trees are all merged immediately. This means that an INSERT,
 | |
| UPDATE or DELETE that triggers a crisis-merge may take a long time to 
 | |
| complete.
 | |
| 
 | |
| </p><p>The default 'crisismerge' value is 16. There is no maximum limit. Attempting
 | |
| to set the 'crisismerge' parameter to a value of 0 or 1 is equivalent to
 | |
| setting it to the default value (16). It is an error to attempt to set the
 | |
| 'crisismerge' option to a negative value.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('crisismerge', 16);
 | |
| </pre></div>
 | |
| 
 | |
| <a name="the_delete_command"></a>
 | |
| <h2 tags="FTS5 delete command" id="the_delete_command"><span>6.3. </span>The 'delete' Command</h2>
 | |
| 
 | |
| <p> This command is only available with <a href="fts5.html#external_content_tables">external content</a> and <a href="fts5.html#contentless_tables">contentless</a> tables. It
 | |
| is used to delete the index entries associated with a single row from the
 | |
| full-text index. This command and the <a href="fts5.html#the_delete_all_command">delete-all</a>
 | |
| command are the only ways to remove entries from the full-text index of a
 | |
| contentless table.
 | |
| 
 | |
| </p><p> In order to use this command to delete a row, the text value 'delete' 
 | |
| must be inserted into the special column with the same name as the table.
 | |
| The rowid of the row to delete is inserted into the rowid column. The
 | |
| values inserted into the other columns must match the values currently
 | |
| stored in the table. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Insert a row with rowid=14 into the fts5 table.</i>
 | |
| INSERT INTO ft(rowid, a, b, c) VALUES(14, $a, $b, $c);
 | |
| 
 | |
| <i>-- Remove the same row from the fts5 table.</i>
 | |
| INSERT INTO ft(ft, rowid, a, b, c) VALUES('delete', 14, $a, $b, $c);
 | |
| </pre></div>
 | |
| 
 | |
| <p> If the values "inserted" into the text columns as part of a 'delete'
 | |
| command are not the same as those currently stored within the table, the
 | |
| results may be unpredictable.
 | |
| 
 | |
| </p><p> The reason for this is easy to understand: When a document is inserted
 | |
| into the FTS5 table, an entry is added to the full-text index to record the
 | |
| position of each token within the new document. When a document is removed,
 | |
| the original data is required in order to determine the set of entries that
 | |
| need to be removed from the full-text index. So if the data supplied to FTS5
 | |
| when a row is deleted using this command is different from that used to
 | |
| determine the set of token instances when it was inserted, some full-text 
 | |
| index entries may not be correctly deleted, or FTS5 may try to remove index 
 | |
| entries that do not exist. This can leave the full-text index in an
 | |
| unpredictable state, making future query results unreliable.
 | |
| 
 | |
| </p><a name="the_delete_all_command"></a>
 | |
| <h2 tags="FTS5 delete-all command" id="the_delete_all_command"><span>6.4. </span>The 'delete-all' Command</h2>
 | |
| 
 | |
| <p> This command is only available with <a href="fts5.html#external_content_tables">external content</a> and <a href="fts5.html#contentless_tables">contentless</a> tables. It
 | |
| deletes all entries from the full-text index.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('delete-all');
 | |
| </pre></div>
 | |
| 
 | |
| <h2 id="the_integrity_check_command"><span>6.5. </span>The 'integrity-check' Command</h2>
 | |
| 
 | |
| <p> This command is used to verify that the full-text index is consistent 
 | |
| with the contents of the FTS5 table or <a href="fts5.html#external_content_tables">content 
 | |
| table</a>. It is not available with <a href="fts5.html#contentless_tables">contentless tables</a>.
 | |
| 
 | |
| </p><p>The integrity-check command is invoked by inserting the text value
 | |
| 'integrity-check' into the special column with the same name as the FTS5
 | |
| table. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('integrity-check');
 | |
| </pre></div>
 | |
| 
 | |
| <p>If the full-text index is consistent with the contents of the table, the
 | |
| INSERT used to invoke the integrity-check command succeeds. Or, if any
 | |
| discrepancy is found, it fails with an <a href="rescode.html#corrupt_vtab">SQLITE_CORRUPT_VTAB</a> error.
 | |
| 
 | |
| </p><a name="the_merge_command"></a>
 | |
| <h2 tags="FTS5 merge command" id="the_merge_command"><span>6.6. </span>The 'merge' Command</h2>
 | |
| 
 | |
| <div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('merge', 500);
 | |
| </pre></div>
 | |
| 
 | |
| <p> This command merges b-tree structures together until roughly N pages
 | |
| of merged data have been written to the database, where N is the absolute
 | |
| value of the parameter specified as part of the 'merge' command. The size of
 | |
| each page is as configured by the <a href="fts5.html#the_pgsz_configuration_option">FTS5 pgsz option</a>.
 | |
| 
 | |
| </p><p> If the parameter is a positive value, B-tree structures are only eligible
 | |
| for merging if one of the following is true:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> There are U or more such b-trees on a
 | |
|        single level (see the documentation for the <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a>
 | |
|        for an explanation of b-tree levels), where U is the value assigned
 | |
|        to the <a href="fts5.html#the_usermerge_configuration_option">FTS5 usermerge option</a> option.
 | |
|   </li><li> A merge has already been started (perhaps by a 'merge' command that
 | |
|        specified a negative parameter).
 | |
| </li></ul>
 | |
| 
 | |
| <p> It is possible to tell whether or not the 'merge' command found any 
 | |
| b-trees to merge together by checking the value returned by the
 | |
| <a href="c3ref/total_changes.html">sqlite3_total_changes()</a> API before and after the command is executed. If
 | |
| the difference between the two values is 2 or greater, then work was performed.
 | |
| If the difference is less than 2, then the 'merge' command was a no-op. In this
 | |
| case there is no reason to execute the same 'merge' command again, at least
 | |
| until after the FTS table is next updated.
 | |
| 
 | |
| </p><p> If the parameter is negative, and there are B-tree structures on more than
 | |
| one level within the FTS index, all B-tree structures are assigned to the same
 | |
| level before the merge operation is commenced. Additionally, if the parameter
 | |
| is negative, the value of the usermerge configuration option is not 
 | |
| respected - as few as two b-trees from the same level may be merged together.
 | |
| 
 | |
| </p><p> The above means that executing the 'merge' command with a negative
 | |
| parameter until the before and after difference in the return value of
 | |
| <a href="c3ref/total_changes.html">sqlite3_total_changes()</a> is less than two optimizes the FTS index in the
 | |
| same way as the <a href="fts5.html#the_optimize_command">FTS5 optimize command</a>. However, if a new b-tree is added
 | |
| to the FTS index while this process is ongoing, FTS5 will move the new 
 | |
| b-tree to the same level as the existing b-trees and restart the merge. To
 | |
| avoid this, only the first call to 'merge' should specify a negative parameter.
 | |
| Each subsequent call to 'merge' should specify a positive value so that the
 | |
| merge started by the first call is run to completion even if new b-trees are
 | |
| added to the FTS index.
 | |
| 
 | |
| </p><a name="the_optimize_command"></a>
 | |
| <h2 tags="FTS5 optimize command" id="the_optimize_command"><span>6.7. </span>The 'optimize' Command</h2>
 | |
| 
 | |
| <p>This command merges all individual b-trees that currently make up the
 | |
| full-text index into a single large b-tree structure. This ensures that the
 | |
| full-text index consumes the minimum space within the database and is in the
 | |
| fastest form to query.
 | |
| 
 | |
| </p><p>Refer to the documentation for the <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a> for more details
 | |
| regarding the relationship between the full-text index and its component
 | |
| b-trees.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('optimize');
 | |
| </pre></div>
 | |
| 
 | |
| <p>Because it reorganizes the entire FTS index, the optimize command can 
 | |
| take a long time to run. The <a href="fts5.html#the_merge_command">FTS5 merge command</a> can be used to divide
 | |
| the work of optimizing the FTS index into multiple steps. To do this:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> Invoke the 'merge' command once with the parameter set to -N, then
 | |
|   </li><li> Invoke the 'merge' command zero or more times with the parameter set to N.
 | |
| </li></ul>
 | |
| 
 | |
| <p>where N is the number of pages of data to merge within each invocation of
 | |
| the merge command. The application should stop invoking merge when the
 | |
| difference in the value returned by the sqlite3_total_changes() function before
 | |
| and after the merge command drops to below two. The merge commands may be
 | |
| issued as part of the same or separate transactions, and by the same or
 | |
| different database clients. Refer to the documentation for the 
 | |
| <a href="fts5.html#the_merge_command">merge command</a> for further details.
 | |
| 
 | |
| </p><a name="the_pgsz_configuration_option"></a>
 | |
| <h2 tags="FTS5 pgsz option" id="the_pgsz_configuration_option"><span>6.8. </span>The 'pgsz' Configuration Option</h2>
 | |
| 
 | |
| <p> This command is used to set the persistent "pgsz" option.
 | |
| 
 | |
| </p><p> The full-text index maintained by FTS5 is stored as a series of fixed-size
 | |
| blobs in a database table. It is not strictly necessary for all blobs that make
 | |
| up a full-text index to be the same size. The pgsz option determines the size
 | |
| of all blobs created by subsequent index writers. The default value is 1000.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('pgsz', 4072);
 | |
| </pre></div>
 | |
| 
 | |
| <a name="the_rank_configuration_option"></a>
 | |
| <h2 tags="FTS5 rank configuration option" id="the_rank_configuration_option"><span>6.9. </span>The 'rank' Configuration Option</h2>
 | |
| 
 | |
| <p> This command is used to set the persistent "rank" option.
 | |
| 
 | |
| </p><p> The rank option is used to change the default auxiliary function mapping
 | |
| for the rank column. The option should be set to a text value in the same
 | |
| format as described for <a href="fts5.html#sorting_by_auxiliary_function_results">"rank MATCH ?"</a> terms 
 | |
| above. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('rank', 'bm25(10.0, 5.0)');
 | |
| </pre></div>
 | |
| 
 | |
| <a name="the_rebuild_command"></a>
 | |
| <h2 tags="FTS5 rebuild command" id="the_rebuild_command"><span>6.10. </span>The 'rebuild' Command</h2>
 | |
| 
 | |
| <p> This command first deletes the entire full-text index, then rebuilds it
 | |
| based on the contents of the table or <a href="fts5.html#external_content_tables">content
 | |
| table</a>.  It is not available with <a href="fts5.html#contentless_tables">contentless
 | |
| tables</a>.
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft) VALUES('rebuild');
 | |
| </pre></div>
 | |
| 
 | |
| <a name="the_usermerge_configuration_option"></a>
 | |
| <h2 tags="FTS5 usermerge option" id="the_usermerge_configuration_option"><span>6.11. </span>The 'usermerge' Configuration Option</h2>
 | |
| 
 | |
| <p> This command is used to set the persistent "usermerge" option.
 | |
| 
 | |
| </p><p> The usermerge option is similar to the automerge and crisismerge options.
 | |
| It is the minimum number of b-tree segments that will be merged together by
 | |
| a 'merge' command with a positive parameter. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>INSERT INTO ft(ft, rank) VALUES('usermerge', 4);
 | |
| </pre></div>
 | |
| 
 | |
| <p> The default value of the usermerge option is 4. The minimum allowed value
 | |
| is 2, and the maximum 16.
 | |
| 
 | |
| </p><a name="extending_fts5"></a>
 | |
| <h1 tags="Extending FTS5" id="extending_fts5"><span>7. </span>Extending FTS5</h1>
 | |
| 
 | |
| <p>FTS5 features APIs allowing it to be extended by:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> Adding new auxiliary functions implemented in C, and
 | |
|   </li><li> Adding new tokenizers, also implemented in C.
 | |
| </li></ul>
 | |
| 
 | |
| <p> The built-in tokenizers and auxiliary functions described in this
 | |
| document are all implemented using the publicly available API described
 | |
| below.
 | |
| 
 | |
| </p><p> Before a new auxiliary function or tokenizer implementation may be 
 | |
| registered with FTS5, an application must obtain a pointer to the "fts5_api"
 | |
| structure. There is one fts5_api structure for each database connection with
 | |
| which the FTS5 extension is registered. To obtain the pointer, the application
 | |
| invokes the SQL user-defined function fts5() with a single argument.  That
 | |
| argument must be set to a pointer to a pointer to an fts5_api object
 | |
| using the <a href="c3ref/bind_blob.html">sqlite3_bind_pointer()</a> interface.
 | |
| The following example code demonstrates the technique:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>/*
 | |
| ** Return a pointer to the fts5_api pointer for database connection db.
 | |
| ** If an error occurs, return NULL and leave an error in the database 
 | |
| ** handle (accessible using sqlite3_errcode()/errmsg()).
 | |
| */</i>
 | |
| fts5_api *fts5_api_from_db(sqlite3 *db){
 | |
|   fts5_api *pRet = 0;
 | |
|   sqlite3_stmt *pStmt = 0;
 | |
| 
 | |
|   if( SQLITE_OK==sqlite3_prepare(db, "SELECT fts5(?1)", -1, &pStmt, 0) ){
 | |
|     sqlite3_bind_pointer(pStmt, (void*)&pRet, "fts5_api_ptr", NULL);
 | |
|     sqlite3_step(pStmt);
 | |
|   }
 | |
|   sqlite3_finalize(pStmt);
 | |
|   return pRet;
 | |
| }
 | |
| </pre></div>
 | |
| 
 | |
| <p><b>Backwards Compatibility Warning:</b>
 | |
| Prior to SQLite version 3.20.0 (2017-08-01), the fts5() worked slightly
 | |
| differently.  Older applications that extend FTS5 must be revised to use 
 | |
| the new technique shown above.
 | |
| 
 | |
| </p><p> The fts5_api structure is defined as follows. It exposes three methods, 
 | |
| one each for registering new auxiliary functions and tokenizers, and one for
 | |
| retrieving existing tokenizer. The latter is intended to facilitate the
 | |
| implementation of "tokenizer wrappers" similar to the built-in
 | |
| porter tokenizer.
 | |
| 
 | |
| </p><div class="codeblock"><pre>typedef struct fts5_api fts5_api;
 | |
| struct fts5_api {
 | |
|   int iVersion;                   <i>/* Currently always set to 2 */</i>
 | |
| 
 | |
|   <i>/* Create a new tokenizer */</i>
 | |
|   int (*xCreateTokenizer)(
 | |
|     fts5_api *pApi,
 | |
|     const char *zName,
 | |
|     void *pContext,
 | |
|     fts5_tokenizer *pTokenizer,
 | |
|     void (*xDestroy)(void*)
 | |
|   );
 | |
| 
 | |
|   <i>/* Find an existing tokenizer */</i>
 | |
|   int (*xFindTokenizer)(
 | |
|     fts5_api *pApi,
 | |
|     const char *zName,
 | |
|     void **ppContext,
 | |
|     fts5_tokenizer *pTokenizer
 | |
|   );
 | |
| 
 | |
|   <i>/* Create a new auxiliary function */</i>
 | |
|   int (*xCreateFunction)(
 | |
|     fts5_api *pApi,
 | |
|     const char *zName,
 | |
|     void *pContext,
 | |
|     fts5_extension_function xFunction,
 | |
|     void (*xDestroy)(void*)
 | |
|   );
 | |
| };
 | |
| </pre></div>
 | |
| 
 | |
| <p> To invoke a method of the fts5_api object, the fts5_api pointer itself
 | |
| should be passed as the methods first argument followed by the other, method
 | |
| specific, arguments. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre>rc = pFts5Api->xCreateTokenizer(pFts5Api, ... other args ...);
 | |
| </pre></div>
 | |
| 
 | |
| <p> The fts5_api structure methods are described individually in the following
 | |
| sections.
 | |
| 
 | |
| </p><a name="custom_tokenizers"></a>
 | |
| <h2 tags="custom tokenizers" id="custom_tokenizers"><span>7.1. </span>Custom Tokenizers</h2>
 | |
| 
 | |
| <p> To create a custom tokenizer, an application must implement three
 | |
| functions: a tokenizer constructor (xCreate), a destructor (xDelete) and a
 | |
| function to do the actual tokenization (xTokenize). The type of each
 | |
| function is as for the member variables of the fts5_tokenizer struct:
 | |
| 
 | |
| </p><div class="codeblock"><pre>typedef struct Fts5Tokenizer Fts5Tokenizer;
 | |
| typedef struct fts5_tokenizer fts5_tokenizer;
 | |
| struct fts5_tokenizer {
 | |
|   int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);
 | |
|   void (*xDelete)(Fts5Tokenizer*);
 | |
|   int (*xTokenize)(Fts5Tokenizer*, 
 | |
|       void *pCtx,
 | |
|       int flags,            <i>/* Mask of FTS5_TOKENIZE_* flags */</i>
 | |
|       const char *pText, int nText, 
 | |
|       int (*xToken)(
 | |
|         void *pCtx,         <i>/* Copy of 2nd argument to xTokenize() */</i>
 | |
|         int tflags,         <i>/* Mask of FTS5_TOKEN_* flags */</i>
 | |
|         const char *pToken, <i>/* Pointer to buffer containing token */</i>
 | |
|         int nToken,         <i>/* Size of token in bytes */</i>
 | |
|         int iStart,         <i>/* Byte offset of token within input text */</i>
 | |
|         int iEnd            <i>/* Byte offset of end of token within input text */</i>
 | |
|       )
 | |
|   );
 | |
| };
 | |
| 
 | |
| <i>/* Flags that may be passed as the third argument to xTokenize() */</i>
 | |
| #define FTS5_TOKENIZE_QUERY     0x0001
 | |
| #define FTS5_TOKENIZE_PREFIX    0x0002
 | |
| #define FTS5_TOKENIZE_DOCUMENT  0x0004
 | |
| #define FTS5_TOKENIZE_AUX       0x0008
 | |
| 
 | |
| <i>/* Flags that may be passed by the tokenizer implementation back to FTS5
 | |
| ** as the third argument to the supplied xToken callback. */</i>
 | |
| #define FTS5_TOKEN_COLOCATED    0x0001      <i>/* Same position as prev. token */</i>
 | |
| </pre></div>
 | |
| 
 | |
| <p> The implementation is registered with the FTS5 module by calling the
 | |
| xCreateTokenizer() method of the fts5_api object. If there is already a
 | |
| tokenizer with the same name, it is replaced.  If a non-NULL xDestroy parameter
 | |
| is passed to xCreateTokenizer(), it is invoked with a copy of the pContext
 | |
| pointer passed as the only argument when the database handle is closed or when
 | |
| the tokenizer is replaced.
 | |
| 
 | |
| </p><p> If successful, xCreateTokenizer() returns SQLITE_OK. Otherwise, it
 | |
| returns an SQLite error code. In this case the xDestroy function is <b>not</b> 
 | |
| invoked.
 | |
| 
 | |
| </p><p> When an FTS5 table uses the custom tokenizer, the FTS5 core calls xCreate()
 | |
| once to create a tokenizer, then xTokenize() zero or more times to tokenize
 | |
| strings, then xDelete() to free any resources allocated by xCreate(). More
 | |
| specifically:
 | |
| 
 | |
| </p><dl>
 | |
| <dt><b>xCreate:</b></dt><dd><p style="margin-top:0">
 | |
|    This function is used to allocate and initialize a tokenizer instance.
 | |
|    A tokenizer instance is required to actually tokenize text.
 | |
| </p><p>
 | |
|    The first argument passed to this function is a copy of the (void*)
 | |
|    pointer provided by the application when the fts5_tokenizer object
 | |
|    was registered with FTS5 (the third argument to xCreateTokenizer()). 
 | |
|    The second and third arguments are an array of nul-terminated strings
 | |
|    containing the tokenizer arguments, if any, specified following the
 | |
|    tokenizer name as part of the CREATE VIRTUAL TABLE statement used
 | |
|    to create the FTS5 table.
 | |
| </p><p>
 | |
|    The final argument is an output variable. If successful, (*ppOut) 
 | |
|    should be set to point to the new tokenizer handle and SQLITE_OK
 | |
|    returned. If an error occurs, some value other than SQLITE_OK should
 | |
|    be returned. In this case, fts5 assumes that the final value of *ppOut 
 | |
|    is undefined.
 | |
| </p><p>
 | |
| </p></dd><dt><b> xDelete:</b></dt><dd><p style="margin-top:0">
 | |
|    This function is invoked to delete a tokenizer handle previously
 | |
|    allocated using xCreate(). Fts5 guarantees that this function will
 | |
|    be invoked exactly once for each successful call to xCreate().
 | |
| </p><p>
 | |
| </p></dd><dt><b> xTokenize:</b></dt><dd><p style="margin-top:0">
 | |
|    This function is expected to tokenize the nText byte string indicated 
 | |
|    by argument pText. pText may or may not be nul-terminated. The first
 | |
|    argument passed to this function is a pointer to an Fts5Tokenizer object
 | |
|    returned by an earlier call to xCreate().
 | |
| </p><p>
 | |
|    The second argument indicates the reason that FTS5 is requesting
 | |
|    tokenization of the supplied text. This is always one of the following
 | |
|    four values:
 | |
| </p><p>
 | |
|    </p><ul><li> <b>FTS5_TOKENIZE_DOCUMENT</b> - A document is being inserted into
 | |
|             or removed from the FTS table. The tokenizer is being invoked to
 | |
|             determine the set of tokens to add to (or delete from) the
 | |
|             FTS index.
 | |
| <p>
 | |
|        </p></li><li> <b>FTS5_TOKENIZE_QUERY</b> - A MATCH query is being executed 
 | |
|             against the FTS index. The tokenizer is being called to tokenize 
 | |
|             a bareword or quoted string specified as part of the query.
 | |
| <p>
 | |
|        </p></li><li> <b>(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)</b> - Same as
 | |
|             FTS5_TOKENIZE_QUERY, except that the bareword or quoted string is
 | |
|             followed by a "*" character, indicating that the last token
 | |
|             returned by the tokenizer will be treated as a token prefix.
 | |
| <p>
 | |
|        </p></li><li> <b>FTS5_TOKENIZE_AUX</b> - The tokenizer is being invoked to 
 | |
|             satisfy an fts5_api.xTokenize() request made by an auxiliary
 | |
|             function. Or an fts5_api.xColumnSize() request made by the same
 | |
|             on a columnsize=0 database.  
 | |
|    </li></ul>
 | |
| <p>
 | |
|    For each token in the input string, the supplied callback xToken() must
 | |
|    be invoked. The first argument to it should be a copy of the pointer
 | |
|    passed as the second argument to xTokenize(). The third and fourth
 | |
|    arguments are a pointer to a buffer containing the token text, and the
 | |
|    size of the token in bytes. The 4th and 5th arguments are the byte offsets
 | |
|    of the first byte of and first byte immediately following the text from
 | |
|    which the token is derived within the input.
 | |
| </p><p>
 | |
|    The second argument passed to the xToken() callback ("tflags") should
 | |
|    normally be set to 0. The exception is if the tokenizer supports 
 | |
|    synonyms. In this case see the discussion below for details.
 | |
| </p><p>
 | |
|    FTS5 assumes the xToken() callback is invoked for each token in the 
 | |
|    order that they occur within the input text.
 | |
| </p><p>
 | |
|    If an xToken() callback returns any value other than SQLITE_OK, then
 | |
|    the tokenization should be abandoned and the xTokenize() method should
 | |
|    immediately return a copy of the xToken() return value. Or, if the
 | |
|    input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally,
 | |
|    if an error occurs with the xTokenize() implementation itself, it
 | |
|    may abandon the tokenization and return any error code other than
 | |
|    SQLITE_OK or SQLITE_DONE.
 | |
| </p><p>
 | |
| </p></dd></dl><h3 id="synonym_support"><span>7.1.1. </span>Synonym Support</h3>
 | |
| <p>
 | |
|    Custom tokenizers may also support synonyms. Consider a case in which a
 | |
|    user wishes to query for a phrase such as "first place". Using the 
 | |
|    built-in tokenizers, the FTS5 query 'first + place' will match instances
 | |
|    of "first place" within the document set, but not alternative forms
 | |
|    such as "1st place". In some applications, it would be better to match
 | |
|    all instances of "first place" or "1st place" regardless of which form
 | |
|    the user specified in the MATCH query text.
 | |
| </p><p>
 | |
|    There are several ways to approach this in FTS5:
 | |
| </p><p>
 | |
|    </p><ol><li> By mapping all synonyms to a single token. In this case, using
 | |
|             the above example, this means that the tokenizer returns the
 | |
|             same token for inputs "first" and "1st". Say that token is in
 | |
|             fact "first", so that when the user inserts the document "I won
 | |
|             1st place" entries are added to the index for tokens "i", "won",
 | |
|             "first" and "place". If the user then queries for '1st + place',
 | |
|             the tokenizer substitutes "first" for "1st" and the query works
 | |
|             as expected.
 | |
| <p>
 | |
|        </p></li><li> By querying the index for all synonyms of each query term
 | |
|             separately. In this case, when tokenizing query text, the
 | |
|             tokenizer may provide multiple synonyms for a single term 
 | |
|             within the document. FTS5 then queries the index for each 
 | |
|             synonym individually. For example, faced with the query:
 | |
| <p>
 | |
|    </p><div class="codeblock"><pre>... MATCH 'first place'
 | |
| </pre></div>
 | |
| <p>
 | |
|             the tokenizer offers both "1st" and "first" as synonyms for the
 | |
|             first token in the MATCH query and FTS5 effectively runs a query 
 | |
|             similar to:
 | |
| </p><p>
 | |
|    </p><div class="codeblock"><pre>... MATCH '(first OR 1st) place'
 | |
| </pre></div>
 | |
| <p>
 | |
|             except that, for the purposes of auxiliary functions, the query
 | |
|             still appears to contain just two phrases - "(first OR 1st)" 
 | |
|             being treated as a single phrase.
 | |
| </p><p>
 | |
|        </p></li><li> By adding multiple synonyms for a single term to the FTS index.
 | |
|             Using this method, when tokenizing document text, the tokenizer
 | |
|             provides multiple synonyms for each token. So that when a 
 | |
|             document such as "I won first place" is tokenized, entries are
 | |
|             added to the FTS index for "i", "won", "first", "1st" and
 | |
|             "place".
 | |
| <p>
 | |
|             This way, even if the tokenizer does not provide synonyms
 | |
|             when tokenizing query text (it should not - to do so would be
 | |
|             inefficient), it doesn't matter if the user queries for 
 | |
|             'first + place' or '1st + place', as there are entries in the
 | |
|             FTS index corresponding to both forms of the first token.
 | |
|    </p></li></ol>
 | |
| <p>
 | |
|    Whether it is parsing document or query text, any call to xToken that
 | |
|    specifies a <i>tflags</i> argument with the FTS5_TOKEN_COLOCATED bit
 | |
|    is considered to supply a synonym for the previous token. For example,
 | |
|    when parsing the document "I won first place", a tokenizer that supports
 | |
|    synonyms would call xToken() 5 times, as follows:
 | |
| </p><p>
 | |
|    </p><div class="codeblock"><pre>xToken(pCtx, 0, "i",                      1,  0,  1);
 | |
| xToken(pCtx, 0, "won",                    3,  2,  5);
 | |
| xToken(pCtx, 0, "first",                  5,  6, 11);
 | |
| xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3,  6, 11);
 | |
| xToken(pCtx, 0, "place",                  5, 12, 17);
 | |
| </pre></div>
 | |
| <p>
 | |
|    It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time
 | |
|    xToken() is called. Multiple synonyms may be specified for a single token
 | |
|    by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. 
 | |
|    There is no limit to the number of synonyms that may be provided for a
 | |
|    single token.
 | |
| </p><p>
 | |
|    In many cases, method (1) above is the best approach. It does not add 
 | |
|    extra data to the FTS index or require FTS5 to query for multiple terms,
 | |
|    so it is efficient in terms of disk space and query speed. However, it
 | |
|    does not support prefix queries very well. If, as suggested above, the
 | |
|    token "first" is substituted for "1st" by the tokenizer, then the query:
 | |
| </p><p>
 | |
|    </p><div class="codeblock"><pre>... MATCH '1s*'
 | |
| </pre></div>
 | |
| <p>
 | |
|    will not match documents that contain the token "1st" (as the tokenizer
 | |
|    will probably not map "1s" to any prefix of "first").
 | |
| </p><p>
 | |
|    For full prefix support, method (3) may be preferred. In this case, 
 | |
|    because the index contains entries for both "first" and "1st", prefix
 | |
|    queries such as 'fi*' or '1s*' will match correctly. However, because
 | |
|    extra entries are added to the FTS index, this method uses more space
 | |
|    within the database.
 | |
| </p><p>
 | |
|    Method (2) offers a midpoint between (1) and (3). Using this method,
 | |
|    a query such as '1s*' will match documents that contain the literal 
 | |
|    token "1st", but not "first" (assuming the tokenizer is not able to
 | |
|    provide synonyms for prefixes). However, a non-prefix query like '1st'
 | |
|    will match against "1st" and "first". This method does not require
 | |
|    extra disk space, as no extra entries are added to the FTS index. 
 | |
|    On the other hand, it may require more CPU cycles to run MATCH queries,
 | |
|    as separate queries of the FTS index are required for each synonym.
 | |
| </p><p>
 | |
|    When using methods (2) or (3), it is important that the tokenizer only
 | |
|    provide synonyms when tokenizing document text (method (2)) or query
 | |
|    text (method (3)), not both. Doing so will not cause any errors, but is
 | |
|    inefficient.
 | |
| 
 | |
| 
 | |
| 
 | |
| </p><a name="custom_auxiliary_functions"></a>
 | |
| <h2 tags="FTS5 custom auxiliary functions" id="custom_auxiliary_functions"><span>7.2. </span>Custom Auxiliary Functions</h2>
 | |
| 
 | |
| <p> Implementing a custom auxiliary function is similar to implementing a
 | |
| <a href="appfunc.html">scalar SQL function</a>. The implementation
 | |
| should be a C function of type fts5_extension_function, defined as follows:
 | |
| 
 | |
| </p><div class="codeblock"><pre>typedef struct Fts5ExtensionApi Fts5ExtensionApi;
 | |
| typedef struct Fts5Context Fts5Context;
 | |
| typedef struct Fts5PhraseIter Fts5PhraseIter;
 | |
| 
 | |
| typedef void (*fts5_extension_function)(
 | |
|   const Fts5ExtensionApi *pApi,   <i>/* API offered by current FTS version */</i>
 | |
|   Fts5Context *pFts,              <i>/* First arg to pass to pApi functions */</i>
 | |
|   sqlite3_context *pCtx,          <i>/* Context for returning result/error */</i>
 | |
|   int nVal,                       <i>/* Number of values in apVal[] array */</i>
 | |
|   sqlite3_value **apVal           <i>/* Array of trailing arguments */</i>
 | |
| );
 | |
| </pre></div>
 | |
| 
 | |
| <p> The implementation is registered with the FTS5 module by calling the
 | |
| xCreateFunction() method of the fts5_api object. If there is already an
 | |
| auxiliary function with the same name, it is replaced by the new function.
 | |
| If a non-NULL xDestroy parameter is passed to xCreateFunction(), it is invoked
 | |
| with a copy of the pContext pointer passed as the only argument when the
 | |
| database handle is closed or when the registered auxiliary function is
 | |
| replaced.
 | |
| 
 | |
| </p><p> If successful, xCreateFunction() returns SQLITE_OK. Otherwise, it
 | |
| returns an SQLite error code. In this case the xDestroy function is <b>not</b> 
 | |
| invoked.
 | |
| 
 | |
| </p><p> The final three arguments passed to the auxiliary function callback are
 | |
| similar to the three arguments passed to the implementation of a scalar SQL
 | |
| function. All arguments except the first passed to the auxiliary function are
 | |
| available to the implementation in the apVal[] array. The
 | |
| implementation should return a result or error via the content handle pCtx.
 | |
| 
 | |
| </p><p> The first argument passed to an auxiliary function callback is a pointer
 | |
| to a structure containing methods that may be invoked in order to obtain
 | |
| information regarding the current query or row. The second argument is an
 | |
| opaque handle that should be passed as the first argument to any such method 
 | |
| invocation. For example, the following auxiliary function definition returns
 | |
| the total number of tokens in all columns of the current row:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>/*
 | |
| ** Implementation of an auxiliary function that returns the number
 | |
| ** of tokens in the current row (including all columns).
 | |
| */</i>
 | |
| static void column_size_imp(
 | |
|   const Fts5ExtensionApi *pApi,
 | |
|   Fts5Context *pFts,
 | |
|   sqlite3_context *pCtx,
 | |
|   int nVal,
 | |
|   sqlite3_value **apVal
 | |
| ){
 | |
|   int rc;
 | |
|   int nToken;
 | |
|   rc = pApi->xColumnSize(pFts, -1, &nToken);
 | |
|   if( rc==SQLITE_OK ){
 | |
|     sqlite3_result_int(pCtx, nToken);
 | |
|   }else{
 | |
|     sqlite3_result_error_code(pCtx, rc);
 | |
|   }
 | |
| }
 | |
| </pre></div>
 | |
| 
 | |
| <p>The following section describes the API offered to auxiliary function
 | |
| implementations in detail. Further examples may be found in the "fts5_aux.c"
 | |
| file of the source code.
 | |
| 
 | |
| </p><a name="_custom_auxiliary_functions_api_reference_"></a>
 | |
| <h3 tags="custom auxiliary functions" id="_custom_auxiliary_functions_api_reference_"><span>7.2.1. </span>
 | |
|   Custom Auxiliary Functions API Reference
 | |
| </h3>
 | |
| 
 | |
| <div class="codeblock"><pre>struct Fts5ExtensionApi {
 | |
|   int iVersion;                   <i>/* Currently always set to 3 */</i>
 | |
| 
 | |
|   void *(*<a href="#xUserData">xUserData</a>)(Fts5Context*);
 | |
| 
 | |
|   int (*<a href="#xColumnCount">xColumnCount</a>)(Fts5Context*);
 | |
|   int (*<a href="#xRowCount">xRowCount</a>)(Fts5Context*, sqlite3_int64 *pnRow);
 | |
|   int (*<a href="#xColumnTotalSize">xColumnTotalSize</a>)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);
 | |
| 
 | |
|   int (*<a href="#xTokenize">xTokenize</a>)(Fts5Context*, 
 | |
|     const char *pText, int nText, <i>/* Text to tokenize */</i>
 | |
|     void *pCtx,                   <i>/* Context passed to xToken() */</i>
 | |
|     int (*xToken)(void*, int, const char*, int, int, int)       <i>/* Callback */</i>
 | |
|   );
 | |
| 
 | |
|   int (*<a href="#xPhraseCount">xPhraseCount</a>)(Fts5Context*);
 | |
|   int (*<a href="#xPhraseSize">xPhraseSize</a>)(Fts5Context*, int iPhrase);
 | |
| 
 | |
|   int (*<a href="#xInstCount">xInstCount</a>)(Fts5Context*, int *pnInst);
 | |
|   int (*<a href="#xInst">xInst</a>)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);
 | |
| 
 | |
|   sqlite3_int64 (*<a href="#xRowid">xRowid</a>)(Fts5Context*);
 | |
|   int (*<a href="#xColumnText">xColumnText</a>)(Fts5Context*, int iCol, const char **pz, int *pn);
 | |
|   int (*<a href="#xColumnSize">xColumnSize</a>)(Fts5Context*, int iCol, int *pnToken);
 | |
| 
 | |
|   int (*<a href="#xQueryPhrase">xQueryPhrase</a>)(Fts5Context*, int iPhrase, void *pUserData,
 | |
|     int(*)(const Fts5ExtensionApi*,Fts5Context*,void*)
 | |
|   );
 | |
|   int (*<a href="#xSetAuxdata">xSetAuxdata</a>)(Fts5Context*, void *pAux, void(*xDelete)(void*));
 | |
|   void *(*<a href="#xGetAuxdata">xGetAuxdata</a>)(Fts5Context*, int bClear);
 | |
| 
 | |
|   int (*<a href="#xPhraseFirst">xPhraseFirst</a>)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*);
 | |
|   void (*<a href="#xPhraseNext">xPhraseNext</a>)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff);
 | |
| 
 | |
|   int (*<a href="#xPhraseFirst">xPhraseFirst</a>Column)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*);
 | |
|   void (*<a href="#xPhraseNext">xPhraseNext</a>Column)(Fts5Context*, Fts5PhraseIter*, int *piCol);
 | |
| };
 | |
| </pre></div>
 | |
| 
 | |
| <dl>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xUserData">
 | |
| <b>void *(*xUserData)(Fts5Context*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Return a copy of the context pointer the extension function was 
 | |
|    registered with.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnTotalSize">
 | |
| <b>int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| If parameter iCol is less than zero, set output variable *pnToken
 | |
|    to the total number of tokens in the FTS5 table. Or, if iCol is
 | |
|    non-negative but less than the number of columns in the table, return
 | |
|    the total number of tokens in column iCol, considering all rows in 
 | |
|    the FTS5 table.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If parameter iCol is greater than or equal to the number of columns
 | |
|    in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g.
 | |
|    an OOM condition or IO error), an appropriate SQLite error code is 
 | |
|    returned.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnCount">
 | |
| <b>int (*xColumnCount)(Fts5Context*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Return the number of columns in the table.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnSize">
 | |
| <b>int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| If parameter iCol is less than zero, set output variable *pnToken
 | |
|    to the total number of tokens in the current row. Or, if iCol is
 | |
|    non-negative but less than the number of columns in the table, set
 | |
|    *pnToken to the number of tokens in column iCol of the current row.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If parameter iCol is greater than or equal to the number of columns
 | |
|    in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g.
 | |
|    an OOM condition or IO error), an appropriate SQLite error code is 
 | |
|    returned.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    This function may be quite inefficient if used with an FTS5 table
 | |
|    created with the "columnsize=0" option.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xColumnText">
 | |
| <b>int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| This function attempts to retrieve the text of column iCol of the
 | |
|    current document. If successful, (*pz) is set to point to a buffer
 | |
|    containing the text in utf-8 encoding, (*pn) is set to the size in bytes
 | |
|    (not characters) of the buffer and SQLITE_OK is returned. Otherwise,
 | |
|    if an error occurs, an SQLite error code is returned and the final values
 | |
|    of (*pz) and (*pn) are undefined.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseCount">
 | |
| <b>int (*xPhraseCount)(Fts5Context*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Returns the number of phrases in the current query expression.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseSize">
 | |
| <b>int (*xPhraseSize)(Fts5Context*, int iPhrase)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Returns the number of tokens in phrase iPhrase of the query. Phrases
 | |
|    are numbered starting from zero.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xInstCount">
 | |
| <b>int (*xInstCount)(Fts5Context*, int *pnInst)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Set *pnInst to the total number of occurrences of all phrases within
 | |
|    the query within the current row. Return SQLITE_OK if successful, or
 | |
|    an error code (i.e. SQLITE_NOMEM) if an error occurs.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    This API can be quite slow if used with an FTS5 table created with the
 | |
|    "detail=none" or "detail=column" option. If the FTS5 table is created 
 | |
|    with either "detail=none" or "detail=column" and "content=" option 
 | |
|    (i.e. if it is a contentless table), then this API always returns 0.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xInst">
 | |
| <b>int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Query for the details of phrase match iIdx within the current row.
 | |
|    Phrase matches are numbered starting from zero, so the iIdx argument
 | |
|    should be greater than or equal to zero and smaller than the value
 | |
|    output by xInstCount().
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    Usually, output parameter *piPhrase is set to the phrase number, *piCol
 | |
|    to the column in which it occurs and *piOff the token offset of the
 | |
|    first token of the phrase. Returns SQLITE_OK if successful, or an error
 | |
|    code (i.e. SQLITE_NOMEM) if an error occurs.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    This API can be quite slow if used with an FTS5 table created with the
 | |
|    "detail=none" or "detail=column" option.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xRowid">
 | |
| <b>sqlite3_int64 (*xRowid)(Fts5Context*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Returns the rowid of the current row.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xTokenize">
 | |
| <b>int (*xTokenize)(Fts5Context*, 
 | |
|     const char *pText, int nText, 
 | |
|     void *pCtx,                   
 | |
|     int (*xToken)(void*, int, const char*, int, int, int)       
 | |
| )</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Tokenize text using the tokenizer belonging to the FTS5 table.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xQueryPhrase">
 | |
| <b>int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData,
 | |
|     int(*)(const Fts5ExtensionApi*,Fts5Context*,void*)
 | |
| )</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| This API function is used to query the FTS table for phrase iPhrase
 | |
|    of the current query. Specifically, a query equivalent to:
 | |
| </p>
 | |
| 
 | |
| <div class="codeblock"><pre>... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
|    with $p set to a phrase equivalent to the phrase iPhrase of the
 | |
|    current query is executed. Any column filter that applies to
 | |
|    phrase iPhrase of the current query is included in $p. For each 
 | |
|    row visited, the callback function passed as the fourth argument 
 | |
|    is invoked. The context and API objects passed to the callback 
 | |
|    function may be used to access the properties of each matched row.
 | |
|    Invoking Api.xUserData() returns a copy of the pointer passed as 
 | |
|    the third argument to pUserData.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If the callback function returns any value other than SQLITE_OK, the
 | |
|    query is abandoned and the xQueryPhrase function returns immediately.
 | |
|    If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK.
 | |
|    Otherwise, the error code is propagated upwards.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If the query runs to completion without incident, SQLITE_OK is returned.
 | |
|    Or, if some error occurs before the query completes or is aborted by
 | |
|    the callback, an SQLite error code is returned.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xSetAuxdata">
 | |
| <b>int (*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*))</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Save the pointer passed as the second argument as the extension function's 
 | |
|    "auxiliary data". The pointer may then be retrieved by the current or any
 | |
|    future invocation of the same fts5 extension function made as part of
 | |
|    the same MATCH query using the xGetAuxdata() API.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    Each extension function is allocated a single auxiliary data slot for
 | |
|    each FTS query (MATCH expression). If the extension function is invoked 
 | |
|    more than once for a single FTS query, then all invocations share a 
 | |
|    single auxiliary data context.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If there is already an auxiliary data pointer when this function is
 | |
|    invoked, then it is replaced by the new pointer. If an xDelete callback
 | |
|    was specified along with the original pointer, it is invoked at this
 | |
|    point.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    The xDelete callback, if one is specified, is also invoked on the
 | |
|    auxiliary data pointer after the FTS5 query has finished.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If an error (e.g. an OOM condition) occurs within this function,
 | |
|    the auxiliary data is set to NULL and an error code returned. If the
 | |
|    xDelete parameter was not NULL, it is invoked on the auxiliary data
 | |
|    pointer before returning.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xGetAuxdata">
 | |
| <b>void *(*xGetAuxdata)(Fts5Context*, int bClear)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| Returns the current auxiliary data pointer for the fts5 extension 
 | |
|    function. See the xSetAuxdata() method for details.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    If the bClear argument is non-zero, then the auxiliary data is cleared
 | |
|    (set to NULL) before this function returns. In this case the xDelete,
 | |
|    if any, is not invoked.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xRowCount">
 | |
| <b>int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| This function is used to retrieve the total number of rows in the table.
 | |
|    In other words, the same value that would be returned by:
 | |
| </p>
 | |
| 
 | |
| <div class="codeblock"><pre>SELECT count(*) FROM ftstable;
 | |
| </pre></div>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseFirst">
 | |
| <b>int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| This function is used, along with type Fts5PhraseIter and the xPhraseNext
 | |
|    method, to iterate through all instances of a single query phrase within
 | |
|    the current row. This is the same information as is accessible via the
 | |
|    xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient
 | |
|    to use, this API may be faster under some circumstances. To iterate 
 | |
|    through instances of phrase iPhrase, use the following code:
 | |
| </p>
 | |
| 
 | |
| <div class="codeblock"><pre>Fts5PhraseIter iter;
 | |
| int iCol, iOff;
 | |
| for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff);
 | |
|     iCol>=0;
 | |
|     pApi->xPhraseNext(pFts, &iter, &iCol, &iOff)
 | |
| ){
 | |
|   // An instance of phrase iPhrase at offset iOff of column iCol
 | |
| }
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
|    The Fts5PhraseIter structure is defined above. Applications should not
 | |
|    modify this structure directly - it should only be used as shown above
 | |
|    with the xPhraseFirst() and xPhraseNext() API methods (and by
 | |
|    xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below).
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    This API can be quite slow if used with an FTS5 table created with the
 | |
|    "detail=none" or "detail=column" option. If the FTS5 table is created 
 | |
|    with either "detail=none" or "detail=column" and "content=" option 
 | |
|    (i.e. if it is a contentless table), then this API always iterates
 | |
|    through an empty set (all calls to xPhraseFirst() set iCol to -1).
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseNext">
 | |
| <b>void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| See xPhraseFirst above.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseFirstColumn">
 | |
| <b>int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| This function and xPhraseNextColumn() are similar to the xPhraseFirst()
 | |
|    and xPhraseNext() APIs described above. The difference is that instead
 | |
|    of iterating through all instances of a phrase in the current row, these
 | |
|    APIs are used to iterate through the set of columns in the current row
 | |
|    that contain one or more instances of a specified phrase. For example:
 | |
| </p>
 | |
| 
 | |
| <div class="codeblock"><pre>Fts5PhraseIter iter;
 | |
| int iCol;
 | |
| for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol);
 | |
|     iCol>=0;
 | |
|     pApi->xPhraseNextColumn(pFts, &iter, &iCol)
 | |
| ){
 | |
|   // Column iCol contains at least one instance of phrase iPhrase
 | |
| }
 | |
| </pre></div>
 | |
| 
 | |
| <p>
 | |
|    This API can be quite slow if used with an FTS5 table created with the
 | |
|    "detail=none" option. If the FTS5 table is created with either 
 | |
|    "detail=none" "content=" option (i.e. if it is a contentless table), 
 | |
|    then this API always iterates through an empty set (all calls to 
 | |
|    xPhraseFirstColumn() set iCol to -1).
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|    The information accessed using this API and its companion
 | |
|    xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext
 | |
|    (or xInst/xInstCount). The chief advantage of this API is that it is
 | |
|    significantly more efficient than those alternatives when used with
 | |
|    "detail=column" tables.
 | |
| </p>
 | |
| </dd>
 | |
| <dt style="white-space:pre;font-family:monospace;font-size:120%" id="xPhraseNextColumn">
 | |
| <b>void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol)</b></dt><dd>
 | |
| <p style="margin-top:0.1em">
 | |
| See xPhraseFirstColumn above.
 | |
| </p>
 | |
| </dd>
 | |
| </dl>
 | |
| 
 | |
| 
 | |
| <a name="the_fts5vocab_virtual_table_module"></a>
 | |
| <h1 tags="fts5vocab" id="the_fts5vocab_virtual_table_module"><span>8. </span>The fts5vocab Virtual Table Module</h1>
 | |
| 
 | |
| <p> The fts5vocab virtual table module allows users to extract information from
 | |
| an FTS5 full-text index directly. The fts5vocab module is a part of FTS5 - it 
 | |
| is available whenever FTS5 is.
 | |
| 
 | |
| </p><p> Each fts5vocab table is associated with a single FTS5 table. An fts5vocab
 | |
| table is usually created by specifying two arguments in place of column names
 | |
| in the CREATE VIRTUAL TABLE statement - the name of the associated FTS5 table
 | |
| and the type of fts5vocab table. Currently there are three types of fts5vocab
 | |
| table; "row", "col" and "instance". Unless the fts5vocab table is created
 | |
| within the "temp" database, it must be part of the same database as the
 | |
| associated FTS5 table.
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create an fts5vocab "row" table to query the full-text index belonging
 | |
| -- to FTS5 table "ft1".</i>
 | |
| CREATE VIRTUAL TABLE ft1_v USING fts5vocab('ft1', 'row');
 | |
| 
 | |
| <i>-- Create an fts5vocab "col" table to query the full-text index belonging
 | |
| -- to FTS5 table "ft2".</i>
 | |
| CREATE VIRTUAL TABLE ft2_v USING fts5vocab(ft2, col);
 | |
| 
 | |
| <i>-- Create an fts5vocab "instance" table to query the full-text index
 | |
| -- belonging to FTS5 table "ft3".</i>
 | |
| CREATE VIRTUAL TABLE ft3_v USING fts5vocab(ft3, instance);
 | |
| </pre></div>
 | |
| 
 | |
| <p> If an fts5vocab table is created in the temp database, it may be associated
 | |
| with an FTS5 table in any attached database. In order to attach the fts5vocab
 | |
| table to an FTS5 table located in a database other than "temp", the name of the
 | |
| database is inserted before the FTS5 table name in the CREATE VIRTUAL TABLE 
 | |
| arguments. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Create an fts5vocab "row" table to query the full-text index belonging
 | |
| -- to FTS5 table "ft1" in database "main".</i>
 | |
| CREATE VIRTUAL TABLE temp.ft1_v USING fts5vocab(main, 'ft1', 'row');
 | |
| 
 | |
| <i>-- Create an fts5vocab "col" table to query the full-text index belonging
 | |
| -- to FTS5 table "ft2" in attached database "aux".</i>
 | |
| CREATE VIRTUAL TABLE temp.ft2_v USING fts5vocab('aux', ft2, col);
 | |
| 
 | |
| <i>-- Create an fts5vocab "instance" table to query the full-text index 
 | |
| -- belonging to FTS5 table "ft3" in attached database "other".</i>
 | |
| CREATE VIRTUAL TABLE temp.ft2_v USING fts5vocab('aux', ft3, 'instance');
 | |
| </pre></div>
 | |
| 
 | |
| <p> Specifying three arguments when creating an fts5vocab table in any database
 | |
| other than "temp" results in an error.
 | |
| 
 | |
| </p><p> An fts5vocab table of type "row" contains one row for each distinct term
 | |
| in the associated FTS5 table. The table columns are as follows:
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
|   <tr style="text-align:left"><th>Column</th><th>Contents
 | |
|   </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td> The term, as stored in the FTS5 index.
 | |
|   </td></tr><tr style="text-align:left"><td>doc</td><td>  The number of rows that contain at least one instance of the term.
 | |
|   </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>cnt</td><td>  The total number of instances of the term in the entire FTS5 table.
 | |
| </td></tr></table>
 | |
| 
 | |
| <p> An fts5vocab table of type "col" contains one row for each distinct term/column
 | |
| combination in the associated FTS5 table. Table columns are as follows:
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
|   <tr style="text-align:left"><th>Column</th><th>Contents
 | |
|   </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td> The term, as stored in the FTS5 index.
 | |
|   </td></tr><tr style="text-align:left"><td>col</td><td>  The name of the FTS5 table column that contains the term.
 | |
|   </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>doc</td><td>  The number of rows in the FTS5 table for which column $col
 | |
|                    contains at least one instance of the term.
 | |
|   </td></tr><tr style="text-align:left"><td>cnt</td><td>  The total number of instances of the term that appear in
 | |
|                    column $col of the FTS5 table (considering all rows). 
 | |
| </td></tr></table>
 | |
| 
 | |
| <p> An fts5vocab table of type "instance" contains one row for each term
 | |
| instance stored in the associated FTS index. Assuming the FTS5 table is
 | |
| created with the 'detail' option set to 'full', table columns are as follows:
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
|   <tr style="text-align:left"><th>Column</th><th>Contents
 | |
|   </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td>term</td><td>   The term, as stored in the FTS5 index.
 | |
|   </td></tr><tr style="text-align:left"><td>doc</td><td>    The rowid of the document that contains the term instance.
 | |
|   </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td>col</td><td>    The name of the column that contains the term instance.
 | |
|   </td></tr><tr style="text-align:left"><td>offset</td><td> The index of the term instance within its column. Terms 
 | |
|                      are numbered in order of occurrence starting from 0.
 | |
| </td></tr></table>
 | |
| 
 | |
| <p> If the FTS5 table is created with the 'detail' option set to 'col', then
 | |
| the <i>offset</i> column of an instance virtual table always contains NULL.
 | |
| In this case there is one row in the table for each unique term/doc/col 
 | |
| combination. Or, if the FTS5 table is created with 'detail' set to 'none',
 | |
| then both <i>offset</i> and <i>col</i> always contain NULL values. For
 | |
| detail=none FTS5 tables, there is one row in the fts5vocab table for each
 | |
| unique term/doc combination.
 | |
| 
 | |
| </p><p>Example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Assuming a database created using:</i>
 | |
| CREATE VIRTUAL TABLE ft1 USING fts5(c1, c2);
 | |
| INSERT INTO ft1 VALUES('apple banana cherry', 'banana banana cherry');
 | |
| INSERT INTO ft1 VALUES('cherry cherry cherry', 'date date date');
 | |
| 
 | |
| <i>-- Then querying the following fts5vocab table (type "col") returns:
 | |
| --
 | |
| --    apple  | c1 | 1 | 1
 | |
| --    banana | c1 | 1 | 1
 | |
| --    banana | c2 | 1 | 2
 | |
| --    cherry | c1 | 2 | 4
 | |
| --    cherry | c2 | 1 | 1
 | |
| --    date   | c3 | 1 | 3
 | |
| --</i>
 | |
| CREATE VIRTUAL TABLE ft1_v_col USING fts5vocab(ft1, col);
 | |
| 
 | |
| <i>-- Querying an fts5vocab table of type "row" returns:
 | |
| --
 | |
| --    apple  | 1 | 1
 | |
| --    banana | 1 | 3
 | |
| --    cherry | 2 | 5
 | |
| --    date   | 1 | 3
 | |
| --</i>
 | |
| CREATE VIRTUAL TABLE ft1_v_row USING fts5vocab(ft1, row);
 | |
| 
 | |
| <i>-- And, for type "instance"
 | |
| INSERT INTO ft1 VALUES('apple banana cherry', 'banana banana cherry');
 | |
| INSERT INTO ft1 VALUES('cherry cherry cherry', 'date date date');
 | |
| --
 | |
| --    apple  | 1 | c1 | 0
 | |
| --    banana | 1 | c1 | 1
 | |
| --    banana | 1 | c2 | 0
 | |
| --    banana | 1 | c2 | 1
 | |
| --    cherry | 1 | c1 | 2
 | |
| --    cherry | 1 | c2 | 2
 | |
| --    cherry | 2 | c1 | 0
 | |
| --    cherry | 2 | c1 | 1
 | |
| --    cherry | 2 | c1 | 2
 | |
| --    date   | 2 | c2 | 0
 | |
| --    date   | 2 | c2 | 1
 | |
| --    date   | 2 | c2 | 2
 | |
| --</i>
 | |
| CREATE VIRTUAL TABLE ft1_v_instance USING fts5vocab(ft1, instance);
 | |
| </pre></div>
 | |
| 
 | |
| 
 | |
| <a name="appendix_a"></a>
 | |
| <h1 id="appendix_a" nonumber="1" tags="comparison with fts4">
 | |
|   Appendix A: Comparison with FTS3/4
 | |
| </h1>
 | |
| 
 | |
| <p> Also available is the similar but more mature <a href="fts3.html">FTS3/4</a> module. 
 | |
| FTS5 is a new version of FTS4 that includes various fixes and solutions for 
 | |
| problems that could not be fixed in FTS4 without sacrificing backwards 
 | |
| compatibility. Some of these problems are 
 | |
| <a href="fts5.html#_summary_of_technical_differences_">described below</a>.
 | |
| 
 | |
| </p><h2 nonumber="1" id="_application_porting_guide_"> Application Porting Guide </h2>
 | |
| 
 | |
| <p> In order to use FTS5 instead of FTS3 or FTS4, applications usually require
 | |
| minimal modifications. Most of these fall into three categories - changes
 | |
| required to the CREATE VIRTUAL TABLE statement used to create the FTS table,
 | |
| changes required to SELECT queries used to execute queries against the table,
 | |
| and changes required to applications that use <a href="fts3.html#snippet">FTS auxiliary functions</a>.
 | |
| 
 | |
| </p><h3 nonumber="1" id="_changes_to_create_virtual_table_statements_"> Changes to CREATE VIRTUAL TABLE statements </h3>
 | |
| 
 | |
| <ol>
 | |
| <li> <p>The module name must be changed from "fts3" or "fts4" to "fts5".
 | |
| 
 | |
| </p></li><li> <p>All type information or constraint specifications must be removed from
 | |
|      column definitions. FTS3/4 ignores everything following the column name in
 | |
|      a column definition, FTS5 attempts to parse it (and will report an error
 | |
|      if it fails to).
 | |
| 
 | |
| </p></li><li> <p>The "matchinfo=fts3" option is not available. The 
 | |
|      <a href="fts5.html#the_columnsize_option">"columnsize=0"</a> option is equivalent.
 | |
| 
 | |
| </p></li><li> <p>The notindexed= option is not available. Adding <a href="fts5.html#the_unindexed_column_option">UNINDEXED</a>
 | |
|      to the column definition is equivalent.
 | |
| 
 | |
| </p></li><li> <p>The ICU tokenizer is not available.
 | |
| 
 | |
| </p></li><li> <p>The compress=, uncompress= and languageid= options are not available.
 | |
|      There is as of yet no equivalent for their functionality.
 | |
| </p></li></ol>
 | |
| 
 | |
| <div class="codeblock"><pre><i> -- FTS3/4 statement </i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts4(
 | |
|   linkid INTEGER,
 | |
|   header CHAR(20),
 | |
|   text VARCHAR,
 | |
|   notindexed=linkid,
 | |
|   matchinfo=fts3,
 | |
|   tokenizer=unicode61
 | |
| );
 | |
| 
 | |
| <i> -- FTS5 equivalent (note - the "tokenizer=unicode61" option is not</i>
 | |
| <i> -- required as this is the default for FTS5 anyway)</i>
 | |
| CREATE VIRTUAL TABLE t1 USING fts5(
 | |
|   linkid UNINDEXED,
 | |
|   header,
 | |
|   text,
 | |
|   columnsize=0
 | |
| );
 | |
| </pre></div>
 | |
| 
 | |
| <h3 nonumber="1" id="_changes_to_select_statements_"> Changes to SELECT statements </h3>
 | |
| 
 | |
| <ol>
 | |
|   <li> <p>The "docid" alias does not exist. Applications must use "rowid"
 | |
|           instead.
 | |
| 
 | |
|   </p></li><li> <p>The behaviour of queries when a column-filter is specified both as
 | |
|           part of the FTS query and by using a column as the LHS of a MATCH
 | |
|           operator is slightly different. For a table with columns "a" and "b"
 | |
|           and a query similar to:
 | |
| </p><div class="codeblock"><pre>... a MATCH 'b: string'
 | |
| </pre></div>
 | |
|        <p>FTS3/4 searches for matches in column "b". However, FTS5 always
 | |
|           returns zero rows, as results are first filtered for column "b", then
 | |
|           for column "a", leaving no results. In other words, in FTS3/4 the
 | |
|           inner filter overrides the outer, in FTS5 both filters are applied.
 | |
| 
 | |
|   </p></li><li> <p>The FTS query syntax (right hand side of the MATCH operator) has
 | |
|           changed in some ways. The FTS5 syntax is quite close to the FTS4
 | |
|           "enhanced syntax". The main difference is that FTS5 is fussier 
 | |
|           about unrecognized punctuation characters and similar within query
 | |
|           strings. Most queries that work with FTS3/4 should also work with
 | |
|           FTS5, and those that do not should return parse errors.
 | |
| </p></li></ol>
 | |
| 
 | |
| <h3 nonumber="1" id="_auxiliary_function_changes_"> Auxiliary Function Changes </h3>
 | |
| 
 | |
| <p> FTS5 has no matchinfo() or offsets() function, and the snippet() function
 | |
| is not as fully-featured as in FTS3/4. However, since FTS5 does provide 
 | |
| an API allowing applications to create <a href="fts5.html#_custom_auxiliary_functions_api_reference_">custom auxiliary functions</a>, any
 | |
| required functionality may be implemented within the application code.
 | |
| 
 | |
| </p><p> The set of built-in auxiliary functions provided by FTS5 may be 
 | |
| improved upon in the future.
 | |
| 
 | |
| </p><h3 nonumber="1" id="_other_issues"> Other Issues</h3>
 | |
| 
 | |
| <ol>
 | |
|   <li><p> The functionality provided by the fts4aux module is now provided
 | |
|           by <a href="fts5.html#the_fts5vocab_virtual_table_module">fts5vocab</a>. The schema of these two tables is slightly different.
 | |
| 
 | |
|   </p></li><li><p> The FTS3/4 "merge=X,Y" command has been replaced by the 
 | |
|           <a href="fts5.html#the_merge_command">FTS5 merge command</a>.
 | |
| 
 | |
|   </p></li><li><p> The FTS3/4 "automerge=X" command has been replaced by the 
 | |
|           <a href="fts5.html#the_automerge_configuration_option">FTS5 automerge option</a>.
 | |
| </p></li></ol>
 | |
| 
 | |
| <a name="_summary_of_technical_differences_"></a>
 | |
| <h2 nonumber="1" tags="fts5 technical differences" id="_summary_of_technical_differences_"> 
 | |
|   Summary of Technical Differences 
 | |
| </h2>
 | |
| 
 | |
| <p>FTS5 is similar to FTS3/4 in that the primary task of each is to maintain
 | |
| an index mapping from each unique token to a list of instances of that token 
 | |
| within a set of documents, where each instance is identified by the document 
 | |
| in which it appears and its position within that document. For example:
 | |
| 
 | |
| </p><div class="codeblock"><pre><i>-- Given the following SQL:</i>
 | |
| CREATE VIRTUAL TABLE ft USING fts5(a, b);
 | |
| INSERT INTO ft(rowid, a, b) VALUES(1, 'X Y', 'Y Z');
 | |
| INSERT INTO ft(rowid, a, b) VALUES(2, 'A Z', 'Y Y');
 | |
| 
 | |
| <i>-- The FTS5 module creates the following mapping on disk:</i>
 | |
| A --> (2, 0, 0)
 | |
| X --> (1, 0, 0)
 | |
| Y --> (1, 0, 1) (1, 1, 0) (2, 1, 0) (2, 1, 1)
 | |
| Z --> (1, 1, 1) (2, 0, 1)
 | |
| </pre></div>
 | |
| 
 | |
| <p>In the example above, each triple identifies the location of a token
 | |
| instance by rowid, column number (columns are numbered sequentially
 | |
| starting at 0 from left to right) and position within the column value (the
 | |
| first token in a column value is 0, the second is 1, and so on). Using this
 | |
| index, FTS5 is able to provide timely answers to queries such as "the set
 | |
| of all documents that contain the token 'A'", or "the set of all documents
 | |
| that contain the sequence 'Y Z'". The list of instances associated with a
 | |
| single token is called an "instance-list".
 | |
| 
 | |
| </p><p>The principle difference between FTS3/4 and FTS5 is that in FTS3/4,
 | |
| each instance-list is stored as a single large database record, whereas
 | |
| in FTS5 large instance-lists are divided between multiple database records.
 | |
| This has the following implications for dealing with large databases that
 | |
| contain large lists:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> <p>FTS5 is able to load instance-lists into memory incrementally in
 | |
|        order to reduce memory usage and peak allocation size. FTS3/4 very
 | |
|        often loads entire instance-lists into memory.
 | |
| 
 | |
|   </p></li><li> <p>When processing queries that feature more than one token, FTS5 is
 | |
|        sometimes able to determine that the query can be answered by
 | |
|        inspecting a subset of a large instance-list. FTS3/4 almost always
 | |
|        has to traverse entire instance-lists.
 | |
| 
 | |
|   </p></li><li> If an instance-list grows so large that it exceeds
 | |
|        the <a href="limits.html#max_length">SQLITE_MAX_LENGTH</a> limit, FTS3/4 is unable to handle it. FTS5
 | |
|        does not have this problem. 
 | |
| </li></ul>
 | |
| 
 | |
| <p>For these reasons, many complex queries may use less memory and run faster 
 | |
| using FTS5.
 | |
| 
 | |
| </p><p>Some other ways in which FTS5 differs from FTS3/4 are:
 | |
| 
 | |
| </p><ul>
 | |
|   <li> <p>FTS5 supports "ORDER BY rank" for returning results in order of
 | |
|        decreasing relevancy.
 | |
| 
 | |
|   </p></li><li> <p>FTS5 features an API allowing users to create custom auxiliary 
 | |
|        functions for advanced ranking and text processing applications. The
 | |
|        special "rank" column may be mapped to a custom auxiliary function
 | |
|        so that adding "ORDER BY rank" to a query works as expected.
 | |
| 
 | |
|   </p></li><li> <p>FTS5 recognizes unicode separator characters and case equivalence by
 | |
|        default. This is also possible using FTS3/4, but must be explicitly
 | |
|        enabled.
 | |
| 
 | |
|   </p></li><li> <p>The query syntax has been revised where necessary to remove
 | |
|        ambiguities and to make it possible to escape special characters
 | |
|        in query terms.
 | |
| 
 | |
|   </p></li><li> <p>By default, FTS3/4 occasionally merges together two or more of the
 | |
|        b-trees that make up its full-text index within an INSERT, UPDATE or
 | |
|        DELETE statement executed by the user. This means that any operation
 | |
|        on an FTS3/4 table may turn out to be surprisingly slow, as FTS3/4 
 | |
|        may unpredictably choose to merge together two or more large b-trees
 | |
|        within it. FTS5 uses incremental merging by default, which limits
 | |
|        the amount of processing that may take place within any given 
 | |
|        INSERT, UPDATE or DELETE operation.
 | |
| </p></li></ul>
 | |
| 
 | |
| <a name="appendix_b"></a>
 | |
| <h1 id="appendix_b" nonumber="1" tags="fts5 shadow tables">
 | |
|   Appendix B: Shadow tables created by FTS5
 | |
| </h1>
 | |
| 
 | |
| <p>
 | |
| When an FTS5 virtual table is created in a database, between 3 and 5 real
 | |
| tables are created in the database. These are known as "<a href="vtab.html#xshadowname">shadow tables</a>", and are
 | |
| used by the virtual table module to store persistent data. They should not
 | |
| be accessed directly by the user. Many other virtual table modules, including
 | |
| <a href="fts3.html">FTS3</a> and <a href="rtree.html">rtree</a>, also create and use shadow tables.
 | |
| 
 | |
| </p><p>FTS5 creates the following shadow tables. In each case the actual table name
 | |
| is based on the name of the FTS5 virtual table (in the following table, replace
 | |
| <name> with the name of the virtual table to find the actual shadow
 | |
| table name).
 | |
| 
 | |
| </p><table striped="1" style="margin:1em auto; width:80%; border-spacing:0">
 | |
| <tr style="text-align:left"><th>Table Name</th><th>Contents
 | |
| </th></tr><tr style="text-align:left;background-color:#DDDDDD"><td><name>_data</td><td> This table contains most of the full-text index data.
 | |
| </td></tr><tr style="text-align:left"><td><name>_idx</td><td> This table contains the remainder of the full-text
 | |
| index data. It is almost always much smaller than the <name>_data table.
 | |
| </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td><name>_config</td><td> Contains the values of persistent
 | |
| configuration parameters.
 | |
| </td></tr><tr style="text-align:left"><td><name>_content</td><td> Contains the actual data inserted into the 
 | |
| FTS5 table. This shadow table is not present for 
 | |
| <a href="fts5.html#contentless_tables">contentless</a> or 
 | |
| <a href="fts5.html#external_content_tables">external content</a> FTS5 tables.
 | |
| </td></tr><tr style="text-align:left;background-color:#DDDDDD"><td><name>_docsize</td><td> Contains the size of each column of each
 | |
| row in the virtual table in tokens. This shadow table is not present if 
 | |
| the <a href="fts5.html#the_columnsize_option">"columnsize" option</a> is set to 0.
 | |
| </td></tr></table>
 | |
| 
 |