2165 lines
		
	
	
		
			106 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			2165 lines
		
	
	
		
			106 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html><head>
 | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
| <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | |
| <link href="sqlite.css" rel="stylesheet">
 | |
| <title>Database File Format</title>
 | |
| <!-- path= -->
 | |
| </head>
 | |
| <body>
 | |
| <div class=nosearch>
 | |
| <a href="index.html">
 | |
| <img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | |
| </a>
 | |
| <div><!-- IE hack to prevent disappearing logo --></div>
 | |
| <div class="tagline desktoponly">
 | |
| Small. Fast. Reliable.<br>Choose any three.
 | |
| </div>
 | |
| <div class="menu mainmenu">
 | |
| <ul>
 | |
| <li><a href="index.html">Home</a>
 | |
| <li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | |
| <li class='wideonly'><a href='about.html'>About</a>
 | |
| <li class='desktoponly'><a href="docs.html">Documentation</a>
 | |
| <li class='desktoponly'><a href="download.html">Download</a>
 | |
| <li class='wideonly'><a href='copyright.html'>License</a>
 | |
| <li class='desktoponly'><a href="support.html">Support</a>
 | |
| <li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | |
| <li class='search' id='search_menubutton'>
 | |
| <a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="menu submenu" id="submenu">
 | |
| <ul>
 | |
| <li><a href='about.html'>About</a>
 | |
| <li><a href='docs.html'>Documentation</a>
 | |
| <li><a href='download.html'>Download</a>
 | |
| <li><a href='support.html'>Support</a>
 | |
| <li><a href='prosupport.html'>Purchase</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="searchmenu" id="searchmenu">
 | |
| <form method="GET" action="search">
 | |
| <select name="s" id="searchtype">
 | |
| <option value="d">Search Documentation</option>
 | |
| <option value="c">Search Changelog</option>
 | |
| </select>
 | |
| <input type="text" name="q" id="searchbox" value="">
 | |
| <input type="submit" value="Go">
 | |
| </form>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_div(nm) {
 | |
| var w = document.getElementById(nm);
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| }else{
 | |
| w.style.display = "block";
 | |
| }
 | |
| }
 | |
| function toggle_search() {
 | |
| var w = document.getElementById("searchmenu");
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| } else {
 | |
| w.style.display = "block";
 | |
| setTimeout(function(){
 | |
| document.getElementById("searchbox").focus()
 | |
| }, 30);
 | |
| }
 | |
| }
 | |
| function div_off(nm){document.getElementById(nm).style.display="none";}
 | |
| window.onbeforeunload = function(e){div_off("submenu");}
 | |
| /* Disable the Search feature if we are not operating from CGI, since */
 | |
| /* Search is accomplished using CGI and will not work without it. */
 | |
| if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
 | |
| document.getElementById("search_menubutton").style.display = "none";
 | |
| }
 | |
| /* Used by the Hide/Show button beside syntax diagrams, to toggle the */
 | |
| function hideorshow(btn,obj){
 | |
| var x = document.getElementById(obj);
 | |
| var b = document.getElementById(btn);
 | |
| if( x.style.display!='none' ){
 | |
| x.style.display = 'none';
 | |
| b.innerHTML='show';
 | |
| }else{
 | |
| x.style.display = '';
 | |
| b.innerHTML='hide';
 | |
| }
 | |
| return false;
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| <div class=fancy>
 | |
| <div class=nosearch>
 | |
| <div class="fancy_title">
 | |
| Database File Format
 | |
| </div>
 | |
| <div class="fancy_toc">
 | |
| <a onclick="toggle_toc()">
 | |
| <span class="fancy_toc_mark" id="toc_mk">►</span>
 | |
| Table Of Contents
 | |
| </a>
 | |
| <div id="toc_sub"><div class="fancy-toc1"><a href="#the_database_file">1. The Database File</a></div>
 | |
| <div class="fancy-toc2"><a href="#hot_journals">1.1. Hot Journals</a></div>
 | |
| <div class="fancy-toc2"><a href="#pages">1.2. Pages</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_database_header">1.3. The Database Header</a></div>
 | |
| <div class="fancy-toc3"><a href="#magic_header_string">1.3.1. Magic Header String</a></div>
 | |
| <div class="fancy-toc3"><a href="#page_size">1.3.2. Page Size</a></div>
 | |
| <div class="fancy-toc3"><a href="#file_format_version_numbers">1.3.3. File format version numbers</a></div>
 | |
| <div class="fancy-toc3"><a href="#reserved_bytes_per_page">1.3.4. Reserved bytes per page</a></div>
 | |
| <div class="fancy-toc3"><a href="#payload_fractions">1.3.5. Payload fractions</a></div>
 | |
| <div class="fancy-toc3"><a href="#file_change_counter">1.3.6. File change counter</a></div>
 | |
| <div class="fancy-toc3"><a href="#in_header_database_size">1.3.7. In-header database size</a></div>
 | |
| <div class="fancy-toc3"><a href="#free_page_list">1.3.8. Free page list</a></div>
 | |
| <div class="fancy-toc3"><a href="#schema_cookie">1.3.9. Schema cookie</a></div>
 | |
| <div class="fancy-toc3"><a href="#schema_format_number">1.3.10. Schema format number</a></div>
 | |
| <div class="fancy-toc3"><a href="#suggested_cache_size">1.3.11. Suggested cache size</a></div>
 | |
| <div class="fancy-toc3"><a href="#incremental_vacuum_settings">1.3.12. Incremental vacuum settings</a></div>
 | |
| <div class="fancy-toc3"><a href="#text_encoding">1.3.13. Text encoding</a></div>
 | |
| <div class="fancy-toc3"><a href="#user_version_number">1.3.14. User version number</a></div>
 | |
| <div class="fancy-toc3"><a href="#application_id">1.3.15. Application ID</a></div>
 | |
| <div class="fancy-toc3"><a href="#write_library_version_number_and_version_valid_for_number">1.3.16. Write library version number and version-valid-for number</a></div>
 | |
| <div class="fancy-toc3"><a href="#header_space_reserved_for_expansion">1.3.17. Header space reserved for expansion</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_lock_byte_page">1.4. The Lock-Byte Page</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_freelist">1.5. The Freelist</a></div>
 | |
| <div class="fancy-toc2"><a href="#b_tree_pages">1.6. B-tree Pages</a></div>
 | |
| <div class="fancy-toc2"><a href="#cell_payload_overflow_pages">1.7. Cell Payload Overflow Pages</a></div>
 | |
| <div class="fancy-toc2"><a href="#pointer_map_or_ptrmap_pages">1.8. Pointer Map or Ptrmap Pages</a></div>
 | |
| <div class="fancy-toc1"><a href="#schema_layer">2. Schema Layer</a></div>
 | |
| <div class="fancy-toc2"><a href="#record_format">2.1. Record Format</a></div>
 | |
| <div class="fancy-toc2"><a href="#record_sort_order">2.2. Record Sort Order</a></div>
 | |
| <div class="fancy-toc2"><a href="#representation_of_sql_tables">2.3. Representation Of SQL Tables</a></div>
 | |
| <div class="fancy-toc2"><a href="#representation_of_without_rowid_tables">2.4. Representation of WITHOUT ROWID Tables</a></div>
 | |
| <div class="fancy-toc3"><a href="#suppression_of_redundant_columns_in_the_primary_key_of_without_rowid_tables">2.4.1. Suppression of redundant columns in the PRIMARY KEY
 | |
| of WITHOUT ROWID tables</a></div>
 | |
| <div class="fancy-toc2"><a href="#representation_of_sql_indices">2.5. Representation Of SQL Indices</a></div>
 | |
| <div class="fancy-toc3"><a href="#suppression_of_redundant_columns_in_without_rowid_secondary_indexes_">2.5.1. Suppression of redundant columns in WITHOUT ROWID secondary indexes
 | |
| </a></div>
 | |
| <div class="fancy-toc2"><a href="#storage_of_the_sql_database_schema">2.6. Storage Of The SQL Database Schema</a></div>
 | |
| <div class="fancy-toc3"><a href="#alternative_names_for_the_schema_table">2.6.1. Alternative Names For The Schema Table</a></div>
 | |
| <div class="fancy-toc3"><a href="#internal_schema_objects">2.6.2. Internal Schema Objects</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_sqlite_sequence_table">2.6.3. The sqlite_sequence table</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_sqlite_stat1_table">2.6.4. The sqlite_stat1 table</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_sqlite_stat2_table">2.6.5. The sqlite_stat2 table</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_sqlite_stat3_table">2.6.6. The sqlite_stat3 table</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_sqlite_stat4_table">2.6.7. The sqlite_stat4 table</a></div>
 | |
| <div class="fancy-toc1"><a href="#the_rollback_journal">3. The Rollback Journal</a></div>
 | |
| <div class="fancy-toc1"><a href="#the_write_ahead_log">4. The Write-Ahead Log</a></div>
 | |
| <div class="fancy-toc2"><a href="#wal_file_format">4.1. WAL File Format</a></div>
 | |
| <div class="fancy-toc2"><a href="#checksum_algorithm">4.2. Checksum Algorithm</a></div>
 | |
| <div class="fancy-toc2"><a href="#checkpoint_algorithm">4.3. Checkpoint Algorithm</a></div>
 | |
| <div class="fancy-toc2"><a href="#wal_reset">4.4. WAL Reset</a></div>
 | |
| <div class="fancy-toc2"><a href="#reader_algorithm">4.5. Reader Algorithm</a></div>
 | |
| <div class="fancy-toc2"><a href="#wal_index_format">4.6. WAL-Index Format</a></div>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_toc(){
 | |
| var sub = document.getElementById("toc_sub")
 | |
| var mk = document.getElementById("toc_mk")
 | |
| if( sub.style.display!="block" ){
 | |
| sub.style.display = "block";
 | |
| mk.innerHTML = "▼";
 | |
| } else {
 | |
| sub.style.display = "none";
 | |
| mk.innerHTML = "►";
 | |
| }
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| <p>This document describes and defines the on-disk database file
 | |
| format used by all releases of SQLite since 
 | |
| version 3.0.0 (2004-06-18).</p>
 | |
| 
 | |
| <h1 id="the_database_file"><span>1. </span>The Database File</h1>
 | |
| 
 | |
| <p>The complete state of an SQLite database is usually
 | |
| contained in a single file on disk called the "main database file".</p>
 | |
| 
 | |
| <p>During a transaction, SQLite stores additional information 
 | |
| in a second file called the "rollback journal", or if SQLite is in
 | |
| <a href="wal.html">WAL mode</a>, a write-ahead log file.
 | |
| 
 | |
| <a name="hotjrnl"></a>
 | |
| 
 | |
| </p><h2 id="hot_journals"><span>1.1. </span>Hot Journals</h2>
 | |
| <p>If the application or
 | |
| host computer crashes before the transaction completes, then the rollback
 | |
| journal or write-ahead log contains information needed 
 | |
| to restore the main database file to a consistent state.  When a rollback 
 | |
| journal or write-ahead log contains information necessary for recovering 
 | |
| the state of the database, they are called a "hot journal" or "hot WAL file".
 | |
| Hot journals and WAL files are only a factor during error recovery
 | |
| scenarios and so are uncommon, but they are part of the state of an SQLite
 | |
| database and so cannot be ignored.  This document defines the format
 | |
| of a rollback journal and the write-ahead log file, but the focus is
 | |
| on the main database file.</p>
 | |
| 
 | |
| <h2 id="pages"><span>1.2. </span>Pages</h2>
 | |
| 
 | |
| <p>The main database file consists of one or more pages.  The size of a
 | |
| page is a power of two between 512 and 65536 inclusive.  All pages within
 | |
| the same database are the same size.  The page size for a database file
 | |
| is determined by the 2-byte integer located at an offset of
 | |
| 16 bytes from the beginning of the database file.</p>
 | |
| 
 | |
| <p>Pages are numbered beginning with 1.  The maximum page number is
 | |
| 4294967294 (2<sup><small>32</small></sup> - 2).  The minimum size
 | |
| SQLite database is a single 512-byte page.
 | |
| The maximum size database would be 2147483646 pages at 65536 bytes per
 | |
| page or 281,474,976,579,584 bytes (about 281 terabytes).  Usually SQLite will
 | |
| hit the maximum file size limit of the underlying filesystem or disk
 | |
| hardware long before it hits its own internal size limit.</p>
 | |
| 
 | |
| <p>In common use, SQLite databases tend to range in size from a few kilobytes
 | |
| to a few gigabytes, though terabyte-size SQLite databases are known to exist
 | |
| in production.</p>
 | |
| 
 | |
| <p>At any point in time, every page in the main database has a single
 | |
| use which is one of the following:
 | |
| </p><ul>
 | |
| <li>The lock-byte page
 | |
| </li><li>A freelist page
 | |
| <ul>
 | |
| <li>A freelist trunk page
 | |
| </li><li>A freelist leaf page
 | |
| </li></ul>
 | |
| </li><li>A b-tree page
 | |
| <ul>
 | |
| <li>A table b-tree interior page
 | |
| </li><li>A table b-tree leaf page
 | |
| </li><li>An index b-tree interior page
 | |
| </li><li>An index b-tree leaf page
 | |
| </li></ul>
 | |
| </li><li>A payload overflow page
 | |
| </li><li>A pointer map page
 | |
| </li></ul>
 | |
| 
 | |
| 
 | |
| <p>All reads from and writes to the main database file begin at a page
 | |
| boundary and all writes are an integer number of pages in size.  Reads
 | |
| are also usually an integer number of pages in size, with the one exception
 | |
| that when the database is first opened, the first 100 bytes of the
 | |
| database file (the database file header) are read as a sub-page size unit.</p>
 | |
| 
 | |
| <p>Before any information-bearing page of the database is modified, 
 | |
| the original unmodified content of that page is written into the 
 | |
| rollback journal.  If a transaction is interrupted and needs to be 
 | |
| rolled back, the rollback journal can then be used to restore the
 | |
| database to its original state.  Freelist leaf pages bear no
 | |
| information that would need to be restored on a rollback and so they
 | |
| are not written to the journal prior to modification, in order to
 | |
| reduce disk I/O.</p>
 | |
| 
 | |
| <a name="database_header"></a>
 | |
| 
 | |
| <h2 id="the_database_header"><span>1.3. </span>The Database Header</h2>
 | |
| 
 | |
| <p>The first 100 bytes of the database file comprise the database file 
 | |
| header.  The database file header is divided into fields as shown by
 | |
| the table below.  All multibyte fields in the database file header are
 | |
| stored with the most significant byte first (big-endian).</p>
 | |
| 
 | |
| <center>
 | |
| <i>Database Header Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td valign="top" align="center">0</td><td valign="top" align="center">16</td><td align="left">
 | |
| The header string: "SQLite format 3\000"
 | |
| </td></tr><tr><td valign="top" align="center">16</td><td valign="top" align="center">2</td><td align="left">
 | |
| The database page size in bytes.  Must be a power of two between 512
 | |
| and 32768 inclusive, or the value 1 representing a page size of 65536.
 | |
| </td></tr><tr><td valign="top" align="center">18</td><td valign="top" align="center">1</td><td align="left">
 | |
| File format write version.  1 for legacy; 2 for <a href="wal.html">WAL</a>.
 | |
| </td></tr><tr><td valign="top" align="center">19</td><td valign="top" align="center">1</td><td align="left">
 | |
| File format read version.  1 for legacy; 2 for <a href="wal.html">WAL</a>.
 | |
| </td></tr><tr><td valign="top" align="center">20</td><td valign="top" align="center">1</td><td align="left">
 | |
| Bytes of unused "reserved" space at the end of each page.  Usually 0.
 | |
| </td></tr><tr><td valign="top" align="center">21</td><td valign="top" align="center">1</td><td align="left">
 | |
| Maximum embedded payload fraction.  Must be 64.
 | |
| </td></tr><tr><td valign="top" align="center">22</td><td valign="top" align="center">1</td><td align="left">
 | |
| Minimum embedded payload fraction.  Must be 32.
 | |
| </td></tr><tr><td valign="top" align="center">23</td><td valign="top" align="center">1</td><td align="left">
 | |
| Leaf payload fraction.  Must be 32.
 | |
| </td></tr><tr><td valign="top" align="center">24</td><td valign="top" align="center">4</td><td align="left">
 | |
| File change counter.
 | |
| </td></tr><tr><td valign="top" align="center">28</td><td valign="top" align="center">4</td><td align="left">
 | |
| Size of the database file in pages.  The "in-header database size".
 | |
| </td></tr><tr><td valign="top" align="center">32</td><td valign="top" align="center">4</td><td align="left">
 | |
| Page number of the first freelist trunk page.
 | |
| </td></tr><tr><td valign="top" align="center">36</td><td valign="top" align="center">4</td><td align="left">
 | |
| Total number of freelist pages.
 | |
| </td></tr><tr><td valign="top" align="center">40</td><td valign="top" align="center">4</td><td align="left">
 | |
| The schema cookie.
 | |
| </td></tr><tr><td valign="top" align="center">44</td><td valign="top" align="center">4</td><td align="left">
 | |
| The schema format number.  Supported schema formats are 1, 2, 3, and 4.
 | |
| </td></tr><tr><td valign="top" align="center">48</td><td valign="top" align="center">4</td><td align="left">
 | |
| Default page cache size.
 | |
| </td></tr><tr><td valign="top" align="center">52</td><td valign="top" align="center">4</td><td align="left">
 | |
| The page number of the largest root b-tree page when in auto-vacuum or
 | |
| incremental-vacuum modes, or zero otherwise.
 | |
| </td></tr><tr><td valign="top" align="center">56</td><td valign="top" align="center">4</td><td align="left">
 | |
| The database text encoding.  A value of 1 means UTF-8.  A value of 2
 | |
| means UTF-16le.  A value of 3 means UTF-16be.
 | |
| </td></tr><tr><td valign="top" align="center">60</td><td valign="top" align="center">4</td><td align="left">
 | |
| The "user version" as read and set by the <a href="pragma.html#pragma_user_version">user_version pragma</a>.
 | |
| </td></tr><tr><td valign="top" align="center">64</td><td valign="top" align="center">4</td><td align="left">
 | |
| True (non-zero) for incremental-vacuum mode.  False (zero) otherwise.
 | |
| </td></tr><tr><td valign="top" align="center">68</td><td valign="top" align="center">4</td><td align="left">
 | |
| The "Application ID" set by <a href="pragma.html#pragma_application_id">PRAGMA application_id</a>.
 | |
| </td></tr><tr><td valign="top" align="center">72</td><td valign="top" align="center">20</td><td align="left">
 | |
| Reserved for expansion.  Must be zero.
 | |
| </td></tr><tr><td valign="top" align="center">92</td><td valign="top" align="center">4</td><td align="left">
 | |
| The <a href="fileformat2.html#validfor">version-valid-for number</a>.
 | |
| </td></tr><tr><td valign="top" align="center">96</td><td valign="top" align="center">4</td><td align="left">
 | |
| <a href="c3ref/c_source_id.html">SQLITE_VERSION_NUMBER</a>
 | |
| </td></tr></table></center>
 | |
| 
 | |
| <h3 id="magic_header_string"><span>1.3.1. </span>Magic Header String</h3>
 | |
| 
 | |
| <p>Every valid SQLite database file begins with the following 16 bytes 
 | |
| (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d 61 74 20 33 00.  This byte sequence
 | |
| corresponds to the UTF-8 string "SQLite format 3" including the nul
 | |
| terminator character at the end.</p>
 | |
| 
 | |
| <h3 id="page_size"><span>1.3.2. </span>Page Size</h3>
 | |
| 
 | |
| <p>The two-byte value beginning at offset 16 determines the page size of 
 | |
| the database.  For SQLite versions 3.7.0.1 (2010-08-04)
 | |
| and earlier, this value is 
 | |
| interpreted as a big-endian integer and must be a power of two between
 | |
| 512 and 32768, inclusive.  Beginning with SQLite <a href="releaselog/3_7_1.html">version 3.7.1</a>
 | |
| (2010-08-23), a page
 | |
| size of 65536 bytes is supported.  The value 65536 will not fit in a
 | |
| two-byte integer, so to specify a 65536-byte page size, the value
 | |
| at offset 16 is 0x00 0x01.
 | |
| This value can be interpreted as a big-endian
 | |
| 1 and thought of as a magic number to represent the 65536 page size.
 | |
| Or one can view the two-byte field as a little endian number and say
 | |
| that it represents the page size divided by 256.  These two 
 | |
| interpretations of the page-size field are equivalent.</p>
 | |
| 
 | |
| <a name="vnums"></a>
 | |
| 
 | |
| <h3 id="file_format_version_numbers"><span>1.3.3. </span>File format version numbers</h3>
 | |
| 
 | |
| <p>The file format write version and file format read version at offsets
 | |
| 18 and 19 are intended to allow for enhancements of the file format
 | |
| in future versions of SQLite.  In current versions of SQLite, both of
 | |
| these values are 1 for rollback journalling modes and 2 for <a href="wal.html">WAL</a>
 | |
| journalling mode.  If a version of SQLite coded to the current
 | |
| file format specification encounters a database file where the read
 | |
| version is 1 or 2 but the write version is greater than 2, then the database
 | |
| file must be treated as read-only.  If a database file with a read version
 | |
| greater than 2 is encountered, then that database cannot be read or written.</p>
 | |
| 
 | |
| <a name="resbyte"></a>
 | |
| 
 | |
| <h3 id="reserved_bytes_per_page"><span>1.3.4. </span>Reserved bytes per page</h3>
 | |
| 
 | |
| <p>SQLite has the ability to set aside a small number of extra bytes at
 | |
| the end of every page for use by extensions.  These extra bytes are
 | |
| used, for example, by the SQLite Encryption Extension to store a nonce
 | |
| and/or cryptographic checksum associated with each page.  The 
 | |
| "reserved space" size in the 1-byte integer at offset 20 is the number
 | |
| of bytes of space at the end of each page to reserve for extensions.
 | |
| This value is usually 0.  The value can be odd.</p>
 | |
| 
 | |
| <a name="usable_size"></a>
 | |
| 
 | |
| <p>The "usable size" of a database page is the page size specified by the
 | |
| 2-byte integer at offset 16 in the header less the "reserved" space size
 | |
| recorded in the 1-byte integer at offset 20 in the header.  The usable
 | |
| size of a page might be an odd number.  However, the usable size is not
 | |
| allowed to be less than 480.  In other words, if the page size is 512,
 | |
| then the reserved space size cannot exceed 32.</p>
 | |
| 
 | |
| <h3 id="payload_fractions"><span>1.3.5. </span>Payload fractions</h3>
 | |
| 
 | |
| <p>The maximum and minimum embedded payload fractions and the leaf
 | |
| payload fraction values must be 64, 32, and 32.  These values were
 | |
| originally intended to be tunable parameters that could be used to
 | |
| modify the storage format of the b-tree algorithm.  However, that
 | |
| functionality is not supported and there are no current plans to add
 | |
| support in the future.  Hence, these three bytes are fixed at the
 | |
| values specified.</p>
 | |
| 
 | |
| <h3 id="file_change_counter"><span>1.3.6. </span>File change counter</h3>
 | |
| 
 | |
| <a name="chngctr"></a>
 | |
| 
 | |
| <p>The file change counter is a 4-byte big-endian integer at
 | |
| offset 24 that is incremented whenever the database file is unlocked
 | |
| after having been modified.
 | |
| When two or more processes are reading the same database file, each 
 | |
| process can detect database changes from other processes by monitoring 
 | |
| the change counter.
 | |
| A process will normally want to flush its database page cache when
 | |
| another process modified the database, since the cache has become stale.
 | |
| The file change counter facilitates this.</p>
 | |
| 
 | |
| <p>In WAL mode, changes to the database are detected using the wal-index
 | |
| and so the change counter is not needed.  Hence, the change counter might
 | |
| not be incremented on each transaction in WAL mode.</p>
 | |
| 
 | |
| <h3 id="in_header_database_size"><span>1.3.7. </span>In-header database size</h3>
 | |
| 
 | |
| <a name="filesize"></a>
 | |
| 
 | |
| <p>The 4-byte big-endian integer at offset 28 into the header 
 | |
| stores the size of the database file in pages.  If this in-header
 | |
| datasize size is not valid (see the next paragraph), then the database 
 | |
| size is computed by looking
 | |
| at the actual size of the database file. Older versions of SQLite
 | |
| ignored the in-header database size and used the actual file size
 | |
| exclusively.  Newer versions of SQLite use the in-header database
 | |
| size if it is available but fall back to the actual file size if
 | |
| the in-header database size is not valid.</p>
 | |
| 
 | |
| <p>The in-header database size is only considered to be valid if
 | |
| it is non-zero and if the 4-byte <a href="fileformat2.html#chngctr">change counter</a> at offset 24
 | |
| exactly matches the 4-byte <a href="fileformat2.html#validfor">version-valid-for number</a> at offset 92.
 | |
| The in-header database size is always valid 
 | |
| when the database is only modified using recent versions of SQLite,
 | |
| versions 3.7.0 (2010-07-21) and later.
 | |
| If a legacy version of SQLite writes to the database, it will not
 | |
| know to update the in-header database size and so the in-header
 | |
| database size could be incorrect.  But legacy versions of SQLite
 | |
| will also leave the version-valid-for number at offset 92 unchanged
 | |
| so it will not match the change-counter.  Hence, invalid in-header
 | |
| database sizes can be detected (and ignored) by observing when
 | |
| the change-counter does not match the version-valid-for number.</p>
 | |
| 
 | |
| <h3 id="free_page_list"><span>1.3.8. </span>Free page list</h3>
 | |
| 
 | |
| <p>Unused pages in the database file are stored on a freelist.  The
 | |
| 4-byte big-endian integer at offset 32 stores the page number of
 | |
| the first page of the freelist, or zero if the freelist is empty.
 | |
| The 4-byte big-endian integer at offset 36 stores stores the total 
 | |
| number of pages on the freelist.</p>
 | |
| 
 | |
| <h3 id="schema_cookie"><span>1.3.9. </span>Schema cookie</h3>
 | |
| 
 | |
| <p>The schema cookie is a 4-byte big-endian integer at offset 40
 | |
| that is incremented whenever the database schema changes.  A 
 | |
| prepared statement is compiled against a specific version of the
 | |
| database schema.  When the database schema changes, the statement
 | |
| must be reprepared.  When a prepared statement runs, it first checks
 | |
| the schema cookie to ensure the value is the same as when the statement
 | |
| was prepared and if the schema cookie has changed, the statement either
 | |
| automatically reprepares and reruns or it aborts with an <a href="rescode.html#schema">SQLITE_SCHEMA</a> 
 | |
| error.</p>
 | |
| 
 | |
| <a name="schemaformat"></a>
 | |
| 
 | |
| <h3 id="schema_format_number"><span>1.3.10. </span>Schema format number</h3>
 | |
| 
 | |
| <p>The schema format number is a 4-byte big-endian integer at offset 44.
 | |
| The schema format number is similar to the file format read and write
 | |
| version numbers at offsets 18 and 19 except that the schema format number
 | |
| refers to the high-level SQL formatting rather than the low-level b-tree
 | |
| formatting.  Four schema format numbers are currently defined:</p>
 | |
| 
 | |
| <ol>
 | |
| <li value="1">Format 1 is understood by all versions of SQLite back to
 | |
| <a href="releaselog/3_0_0.html">version 3.0.0</a> (2004-06-18).</li>
 | |
| <li value="2">Format 2 adds the ability of rows within the same table
 | |
| to have a varying number of columns, in order to support the
 | |
| <a href="lang_altertable.html">ALTER TABLE ... ADD COLUMN</a> functionality.  Support for
 | |
| reading and writing format 2 was added in SQLite 
 | |
| <a href="releaselog/3_1_3.html">version 3.1.3</a> on 2005-02-20.</li>
 | |
| <li value="3">Format 3 adds the ability of extra columns added by
 | |
| <a href="lang_altertable.html">ALTER TABLE ... ADD COLUMN</a> to have non-NULL default
 | |
| values.  This capability was added in SQLite <a href="releaselog/3_1_4.html">version 3.1.4</a>
 | |
| on 2005-03-11.</li>
 | |
| <li value="4">Format 4 causes SQLite to respect the
 | |
| <a href="lang_createindex.html#descidx">DESC keyword</a> on
 | |
| index declarations.  (The DESC keyword is ignored in indexes for 
 | |
| formats 1, 2, and 3.)
 | |
| Format 4 also adds two new boolean record type values (<a href="fileformat2.html#serialtype">serial types</a>
 | |
| 8 and 9).  Support for format 4 was added in SQLite 3.3.0 on
 | |
| 2006-01-10.</li>
 | |
| </ol>
 | |
| 
 | |
| <p>New database files created by SQLite use format 4 by default.
 | |
| The <a href="pragma.html#pragma_legacy_file_format">legacy_file_format pragma</a> can be used to cause SQLite
 | |
| to create new database files using format 1.
 | |
| The format version number can be made to default to 1 instead of 4 by
 | |
| setting <a href="compile.html#default_file_format">SQLITE_DEFAULT_FILE_FORMAT</a>=1 at compile-time.
 | |
| </p>
 | |
| 
 | |
| <h3 id="suggested_cache_size"><span>1.3.11. </span>Suggested cache size</h3>
 | |
| 
 | |
| <p>The 4-byte big-endian signed integer at offset 48 is the suggested
 | |
| cache size in pages for the database file.  The value is a suggestion
 | |
| only and SQLite is under no obligation to honor it.  The absolute value
 | |
| of the integer is used as the suggested size.  The suggested cache size
 | |
| can be set using the <a href="pragma.html#pragma_default_cache_size">default_cache_size pragma</a>.</p>
 | |
| 
 | |
| <h3 id="incremental_vacuum_settings"><span>1.3.12. </span>Incremental vacuum settings</h3>
 | |
| 
 | |
| <p>The two 4-byte big-endian integers at offsets 52 and 64 are used
 | |
| to manage the <a href="pragma.html#pragma_auto_vacuum">auto_vacuum</a> and <a href="pragma.html#pragma_incremental_vacuum">incremental_vacuum</a> modes.  If
 | |
| the integer at offset 52 is zero then pointer-map (ptrmap) pages are
 | |
| omitted from the database file and neither auto_vacuum nor
 | |
| incremental_vacuum are supported.  If the integer at offset 52 is
 | |
| non-zero then it is the page number of the largest root page in the
 | |
| database file, the database file will contain ptrmap pages, and the
 | |
| mode must be either auto_vacuum or incremental_vacuum.  In this latter
 | |
| case, the integer at offset 64 is true for incremental_vacuum and
 | |
| false for auto_vacuum.  If the integer at offset 52 is zero then
 | |
| the integer at offset 64 must also be zero.</p>
 | |
| 
 | |
| <a name="enc"></a>
 | |
| 
 | |
| <h3 id="text_encoding"><span>1.3.13. </span>Text encoding</h3>
 | |
| 
 | |
| <p>The 4-byte big-endian integer at offset 56 determines the encoding
 | |
| used for all text strings stored in the database.  
 | |
| A value of 1 means UTF-8.
 | |
| A value of 2 means UTF-16le.
 | |
| A value of 3 means UTF-16be.
 | |
| No other values are allowed.
 | |
| The sqlite3.h header file defines C-preprocessor macros SQLITE_UTF8 as 1,
 | |
| SQLITE_UTF16LE as 2, and SQLITE_UTF16BE as 3, to use in place of
 | |
| the numeric codes for the text encoding.</p>
 | |
| 
 | |
| <h3 id="user_version_number"><span>1.3.14. </span>User version number</h3>
 | |
| 
 | |
| <p>The 4-byte big-endian integer at offset 60 is the user version which
 | |
| is set and queried by the <a href="pragma.html#pragma_user_version">user_version pragma</a>.  The user version is
 | |
| not used by SQLite.</p>
 | |
| 
 | |
| <a name="appid"></a>
 | |
| 
 | |
| <h3 id="application_id"><span>1.3.15. </span>Application ID</h3>
 | |
| 
 | |
| <p>The 4-byte big-endian integer at offset 68 is an "Application ID" that
 | |
| can be set by the <a href="pragma.html#pragma_application_id">PRAGMA application_id</a> command in order to identify the
 | |
| database as belonging to or associated with a particular application.
 | |
| The application ID is intended for database files used as an
 | |
| <a href="appfileformat.html">application file-format</a>.  The application ID can be used by utilities 
 | |
| such as <a href="http://www.darwinsys.com/file/">file(1)</a> to determine the specific
 | |
| file type rather than just reporting "SQLite3 Database".  A list of
 | |
| assigned application IDs can be seen by consulting the
 | |
| <a href="http://www.sqlite.org/src/artifact?ci=trunk&filename=magic.txt">magic.txt</a>
 | |
| file in the SQLite source repository.</p>
 | |
| 
 | |
| <a name="validfor"></a>
 | |
| 
 | |
| <h3 id="write_library_version_number_and_version_valid_for_number"><span>1.3.16. </span>Write library version number and version-valid-for number</h3>
 | |
| 
 | |
| <p>The 4-byte big-endian integer at offset 96 stores the 
 | |
| <a href="c3ref/c_source_id.html">SQLITE_VERSION_NUMBER</a> value for the SQLite library that most
 | |
| recently modified the database file.  The 4-byte big-endian integer at
 | |
| offset 92 is the value of the <a href="fileformat2.html#chngctr">change counter</a> when the version number
 | |
| was stored.  The integer at offset 92 indicates which transaction
 | |
| the version number is valid for and is sometimes called the
 | |
| "version-valid-for number".
 | |
| 
 | |
| </p><h3 id="header_space_reserved_for_expansion"><span>1.3.17. </span>Header space reserved for expansion</h3>
 | |
| 
 | |
| <p>All other bytes of the database file header are reserved for
 | |
| future expansion and must be set to zero.</p>
 | |
| 
 | |
| <a name="lockbyte"></a>
 | |
| 
 | |
| <h2 id="the_lock_byte_page"><span>1.4. </span>The Lock-Byte Page</h2>
 | |
| 
 | |
| <p>The lock-byte page is the single page of the database file
 | |
| that contains the bytes at offsets between 1073741824 and 1073742335,
 | |
| inclusive.  A database file that is less than or equal to 1073741824 bytes 
 | |
| in size contains no lock-byte page.  A database file larger than
 | |
| 1073741824 contains exactly one lock-byte page.
 | |
| </p>
 | |
| 
 | |
| <p>The lock-byte page is set aside for use by the operating-system specific
 | |
| <a href="vfs.html">VFS</a> implementation in implementing the database file locking primitives.
 | |
| SQLite does not use the lock-byte page.  The SQLite core 
 | |
| will never read or write the lock-byte page,
 | |
| though operating-system specific <a href="vfs.html">VFS</a> 
 | |
| implementations may choose to read or write bytes on the lock-byte 
 | |
| page according to the 
 | |
| needs and proclivities of the underlying system.  The unix and win32
 | |
| <a href="vfs.html">VFS</a> implementations that come built into SQLite do not write to the
 | |
| lock-byte page, but third-party VFS implementations for
 | |
| other operating systems might.</p>
 | |
| 
 | |
| <p>The lock-byte page arose from the need to support Win95 which was the
 | |
| predominant operating system when this file format was designed and which 
 | |
| only supported mandatory file locking.  All modern operating systems that
 | |
| we know of support advisory file locking, and so the lock-byte page is
 | |
| not really needed any more, but is retained for backwards compatibility.</p>
 | |
| 
 | |
| <a name="freelist"></a>
 | |
| 
 | |
| <h2 id="the_freelist"><span>1.5. </span>The Freelist</h2>
 | |
| 
 | |
| <p>A database file might contain one or more pages that are not in
 | |
| active use.  Unused pages can come about, for example, when information
 | |
| is deleted from the database.  Unused pages are stored on the freelist
 | |
| and are reused when additional pages are required.</p>
 | |
| 
 | |
| <p>The freelist is organized as a linked list of freelist trunk pages
 | |
| with each trunk page containing page numbers for zero or more freelist
 | |
| leaf pages.</p>
 | |
| 
 | |
| <p>A freelist trunk page consists of an array of 4-byte big-endian integers.
 | |
| The size of the array is as many integers as will fit in the usable space
 | |
| of a page.  The minimum usable space is 480 bytes so the array will always
 | |
| be at least 120 entries in length.  The first integer on a freelist trunk
 | |
| page is the page number of the next freelist trunk page in the list or zero 
 | |
| if this is the last freelist trunk page.  The second integer on a freelist
 | |
| trunk page is the number of leaf page pointers to follow.  
 | |
| Call the second integer on a freelist trunk page L.
 | |
| If L is greater than zero then integers with array indexes between 2 and
 | |
| L+1 inclusive contain page numbers for freelist leaf pages.</p>
 | |
| 
 | |
| <p>Freelist leaf pages contain no information.  SQLite avoids reading or
 | |
| writing freelist leaf pages in order to reduce disk I/O.</p>
 | |
| 
 | |
| <p>A bug in SQLite versions prior to 3.6.0 (2008-07-16)
 | |
| caused the database to be
 | |
| reported as corrupt if any of the last 6 entries in the freelist trunk page 
 | |
| array contained non-zero values.  Newer versions of SQLite do not have
 | |
| this problem.  However, newer versions of SQLite still avoid using the 
 | |
| last six entries in the freelist trunk page array in order that database
 | |
| files created by newer versions of SQLite can be read by older versions
 | |
| of SQLite.</p>
 | |
| 
 | |
| <p>The number of freelist pages is stored as a 4-byte big-endian integer
 | |
| in the database header at an offset of 36 from the beginning of the file.
 | |
| The database header also stores the page number of the first freelist trunk
 | |
| page as a 4-byte big-endian integer at an offset of 32 from the beginning
 | |
| of the file.</p>
 | |
| 
 | |
| <a name="btree"></a>
 | |
| 
 | |
| <h2 id="b_tree_pages"><span>1.6. </span>B-tree Pages</h2>
 | |
| 
 | |
| <p>The b-tree algorithm provides key/data storage with unique and
 | |
| ordered keys on page-oriented storage devices.
 | |
| For background information on b-trees, see
 | |
| Knuth, <u>The Art Of Computer Programming</u>, Volume 3 "Sorting
 | |
| and Searching", pages 471-479.  Two variants of b-trees are used by
 | |
| SQLite.  "Table b-trees" use a 64-bit signed integer key and store
 | |
| all data in the leaves.  "Index b-trees" use arbitrary keys and store no
 | |
| data at all.
 | |
| 
 | |
| </p><p>A b-tree page is either an interior page or a leaf page.
 | |
| A leaf page contains keys and in the case of a table b-tree each
 | |
| key has associated data.  An interior page contains
 | |
| K keys together with K+1 pointers to child b-tree pages.
 | |
| A "pointer" in an interior b-tree page is just the 31-bit integer
 | |
| page number of the child page.</p>
 | |
| 
 | |
| <p>Define the depth
 | |
| of a leaf b-tree to be 1 and the depth of any interior b-tree to be one
 | |
| more than the maximum depth of any of its children.  In a well-formed
 | |
| database, all children of an interior b-tree have the same depth.</p>
 | |
| 
 | |
| <p>In an interior b-tree page, the pointers and keys logically alternate 
 | |
| with a pointer on both ends. (The previous sentence is to be understood
 | |
| conceptually - the actual layout of the keys and
 | |
| pointers within the page is more complicated and will be described in
 | |
| the sequel.)  All keys within the same page are unique and are logically
 | |
| organized in ascending order from left to right.  (Again, this ordering
 | |
| is logical, not physical.  The actual location of keys within the page
 | |
| is arbitrary.) For any key X, pointers to the left
 | |
| of a X refer to b-tree pages on which all keys are less than or equal to X.
 | |
| Pointers to the right of X refer to pages where all keys are 
 | |
| greater than X.</p>
 | |
| 
 | |
| <p>Within an interior b-tree page, each key and the pointer to its
 | |
| immediate left are combined into a structure called a "cell".  The
 | |
| right-most pointer is held separately.  A leaf b-tree page has no
 | |
| pointers, but it still uses the cell structure to hold keys for
 | |
| index b-trees or keys and content for table b-trees.  Data is also
 | |
| contained in the cell.
 | |
| </p>
 | |
| 
 | |
| <p>Every b-tree page has at most one parent b-tree page.
 | |
| A b-tree page without a parent is called a root page.  A root b-tree page
 | |
| together with the closure of its children form a complete b-tree.
 | |
| It is possible (and in fact rather common) to have a complete b-tree
 | |
| that consists of a single page that is both a leaf and the root.
 | |
| Because there are pointers from parents to children, every page of a
 | |
| complete b-tree can be located if only the root page is known.  Hence,
 | |
| b-trees are identified by their root page number.</p>
 | |
| 
 | |
| <a name="btypes"></a>
 | |
| 
 | |
| <p>A b-tree page is either a table b-tree page or an index b-tree page.
 | |
| All pages within each complete b-tree are of the same type: either table
 | |
| or index.  There is one table b-trees in the database file
 | |
| for each rowid table in the database schema, including system tables
 | |
| such as <a href="schematab.html">sqlite_schema</a>.  There is one index b-tree
 | |
| in the database file for each index in the schema, including implied indexes
 | |
| created by uniqueness constraints.  There are no b-trees associated with
 | |
| <a href="vtab.html">virtual tables</a>.  Specific virtual table implementations might make use
 | |
| of <a href="vtab.html#xshadowname">shadow tables</a> for storage, but those shadow tables will have separate
 | |
| entries in the database schema.  <a href="withoutrowid.html">WITHOUT ROWID</a> tables use index b-trees
 | |
| rather than a table b-trees, so there is one
 | |
| index b-tree in the database file for each <a href="withoutrowid.html">WITHOUT ROWID</a> table.
 | |
| The b-tree corresponding to the sqlite_schema table is always a table
 | |
| b-tree and always has a root page of 1.
 | |
| The sqlite_schema table contains the root page number for every other 
 | |
| table and index in the database file.</p>
 | |
| 
 | |
| <p>Each entry in a table b-tree consists of a 64-bit signed integer key
 | |
| and up to 2147483647 bytes of arbitrary data.  (The key of a table b-tree
 | |
| corresponds to the <a href="lang_createtable.html#rowid">rowid</a> of the SQL table that the b-tree implements.)
 | |
| Interior table b-trees hold only keys and pointers to children.
 | |
| All data is contained in the table b-tree leaves.</p>
 | |
| 
 | |
| <p>Each entry in an index b-tree consists of an arbitrary key of up
 | |
| to 2147483647 bytes in length and no data.</p>
 | |
| 
 | |
| <a name="cell_payload"></a>
 | |
| 
 | |
| <p>Define the "payload" of a cell to be the arbitrary length section
 | |
| of the cell.  For an index b-tree, the key is always arbitrary in length
 | |
| and hence the payload is the key.  There are no arbitrary length elements
 | |
| in the cells of interior table b-tree pages and so those cells have no
 | |
| payload.  Table b-tree leaf pages contain arbitrary length content and
 | |
| so for cells on those pages the payload is the content.
 | |
| </p><p>When the size of payload for a cell exceeds a certain threshold (to
 | |
| be defined later) then only the first few bytes of the payload
 | |
| are stored on the b-tree page and the balance is stored in a linked list
 | |
| of content overflow pages.</p>
 | |
| 
 | |
| <p>A b-tree page is divided into regions in the following order:
 | |
| 
 | |
| </p><ol>
 | |
| <li>The 100-byte database file header (found on page 1 only)
 | |
| </li><li>The 8 or 12 byte b-tree page header
 | |
| </li><li>The cell pointer array
 | |
| </li><li>Unallocated space
 | |
| </li><li>The cell content area
 | |
| </li><li>The reserved region.
 | |
| </li></ol>
 | |
| 
 | |
| 
 | |
| <p>The 100-byte database file header is found only on page 1, which is
 | |
| always a table b-tree page.  All other b-tree pages in the database file
 | |
| omit this 100-byte header.</p>
 | |
| 
 | |
| <p>The reserved region is an area of unused space at the end of every
 | |
| page (except the locking page) that extensions can use to hold per-page
 | |
| information.  The size of the reserved region is determined by the one-byte
 | |
| unsigned integer found at an offset of 20 into the database file header.
 | |
| The size of the reserved region is usually zero.</p>
 | |
| 
 | |
| <p>The b-tree page header is 8 bytes in size for leaf pages and 12
 | |
| bytes for interior pages.  All multibyte values in the page header
 | |
| are big-endian.
 | |
| The b-tree page header is composed of the following fields:</p>
 | |
| 
 | |
| <center>
 | |
| <i>B-tree Page Header Format</i><br>
 | |
| <table border="1" width="80%">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td align="center" valign="top">0</td><td align="center" valign="top">1</td><td align="left">
 | |
| The one-byte flag at offset 0 indicating the b-tree page type.<ul>
 | |
| <li>A value of 2 (0x02) means the page is an interior index b-tree page.
 | |
| </li><li>A value of 5 (0x05) means the page is an interior table b-tree page.
 | |
| </li><li>A value of 10 (0x0a) means the page is a leaf index b-tree page.
 | |
| </li><li>A value of 13 (0x0d) means the page is a leaf table b-tree page.</li></ul>
 | |
| Any other value for the b-tree page type is an error.
 | |
| </td></tr><tr><td align="center" valign="top">1</td><td align="center" valign="top">2</td><td align="left">
 | |
| The two-byte integer at offset 1 gives the start of the
 | |
| first freeblock on the page, or is zero if there are no freeblocks.
 | |
| </td></tr><tr><td align="center" valign="top">3</td><td align="center" valign="top">2</td><td align="left">
 | |
| The two-byte integer at offset 3 gives the number of cells on the page.
 | |
| </td></tr><tr><td align="center" valign="top">5</td><td align="center" valign="top">2</td><td align="left">
 | |
| The two-byte integer at offset 5 designates the start of the cell content
 | |
| area.  A zero value for this integer is interpreted as 65536.
 | |
| </td></tr><tr><td align="center" valign="top">7</td><td align="center" valign="top">1</td><td align="left">
 | |
| The one-byte integer at offset 7 gives the number of fragmented free
 | |
| bytes within the cell content area.
 | |
| </td></tr><tr><td align="center" valign="top">8</td><td align="center" valign="top">4</td><td align="left">
 | |
| The four-byte page number at offset 8 is the right-most pointer.  This
 | |
| value appears in the header of interior b-tree pages only and is omitted from
 | |
| all other pages.
 | |
| </td></tr></table></center>
 | |
| 
 | |
| <p>The cell pointer array of a b-tree page immediately follows the b-tree
 | |
| page header.  Let K be the number of cells on the btree.  The cell pointer
 | |
| array consists of K 2-byte integer offsets to the cell contents.  The
 | |
| cell pointers are arranged in key order with left-most cell (the cell with the
 | |
| smallest key) first and the right-most cell (the cell with the largest
 | |
| key) last.</p>
 | |
| 
 | |
| <p>Cell content is stored in the cell content region of the b-tree page.
 | |
| SQLite strives to place cells as far toward the end of the b-tree page as
 | |
| it can, in order to leave space for future growth of the cell pointer array.
 | |
| The area in between the last cell pointer array entry and the beginning of
 | |
| the first cell is the unallocated region.
 | |
| </p>
 | |
| 
 | |
| <p>If a page contains no cells (which is only possible for a root page
 | |
| of a table that contains no rows) then the offset to the
 | |
| cell content area will equal the page size minus the bytes of reserved space.
 | |
| If the database uses a 65536-byte page size and the reserved space is zero
 | |
| (the usual value for reserved space) then the cell content offset of an
 | |
| empty page wants to be 65536.  
 | |
| However, that integer is too large to be stored in a
 | |
| 2-byte unsigned integer, so a value of 0 is used in its place.
 | |
| 
 | |
| </p><p>A freeblock is a structure used to identify unallocated space within
 | |
| a b-tree page.  Freeblocks are organized as a chain.  The first 2 bytes of
 | |
| a freeblock are a big-endian integer which is the offset in the b-tree page
 | |
| of the next freeblock in the chain, or zero if the freeblock is the last on
 | |
| the chain.  The third and fourth bytes of each freeblock form
 | |
| a big-endian integer which is the size of the freeblock in bytes, including
 | |
| the 4-byte header.  Freeblocks are always connected in order 
 | |
| of increasing offset.  The second field of the b-tree page header is the
 | |
| offset of the first freeblock, or zero if there are no freeblocks on the
 | |
| page.  In a well-formed b-tree page, there will always be at least one cell
 | |
| before the first freeblock.</p>
 | |
| 
 | |
| <p>A freeblock requires at least 4 bytes of space.  If there is an isolated
 | |
| group of 1, 2, or 3 unused bytes within the cell content area, those bytes
 | |
| comprise a fragment.  The total number of bytes in all fragments is stored
 | |
| in the fifth field of the b-tree page header.  In a well-formed b-tree page,
 | |
| the total number of bytes in fragments may not exceed 60.</p>
 | |
| 
 | |
| <p>The total amount of free space on a b-tree page consists of the size
 | |
| of the unallocated region plus the total size of all freeblocks plus the
 | |
| number of fragmented free bytes.  SQLite may from time to time reorganize
 | |
| a b-tree page so that there are no freeblocks or fragment bytes, all
 | |
| unused bytes are contained in the unallocated space region, and all
 | |
| cells are packed tightly at the end of the page.  This is called 
 | |
| "defragmenting" the b-tree page.</p>
 | |
| 
 | |
| <a name="varint"></a>
 | |
| 
 | |
| 
 | |
| <p>A variable-length integer or "varint" is a static Huffman encoding
 | |
| of 64-bit twos-complement integers that uses less space for small positive 
 | |
| values. 
 | |
| A varint is between 1 and 9 bytes in length.  The varint consists of either
 | |
| zero or more bytes which have the high-order bit set followed by a single byte
 | |
| with the high-order bit clear, or nine bytes, whichever is shorter.
 | |
| The lower seven bits of each of the first eight bytes and all 8 bits of
 | |
| the ninth byte are used to reconstruct the 64-bit twos-complement integer.
 | |
| Varints are big-endian: bits taken from the earlier byte of the varint
 | |
| are more significant than bits taken from the later bytes. </p>
 | |
| 
 | |
| <p>The format of a cell depends on which kind of b-tree page the cell
 | |
| appears on.  The following table shows the elements of a cell, in
 | |
| order of appearance, for the various b-tree page types.
 | |
| 
 | |
| </p><dl>
 | |
| <dt><p>Table B-Tree Leaf Cell (header 0x0d):</p></dt>
 | |
| <dd><p></p><ul>
 | |
| <li>A varint which is the total number of bytes of payload, including any
 | |
| overflow
 | |
| </li><li>A varint which is the integer key, a.k.a. "<a href="lang_createtable.html#rowid">rowid</a>"
 | |
| </li><li>The initial portion of the payload that does not spill to overflow
 | |
| pages.
 | |
| </li><li>A 4-byte big-endian integer page number for the first page of the
 | |
| overflow page list - omitted if all payload fits on the b-tree page.
 | |
| </li></ul></dd>
 | |
| 
 | |
| <dt><p>Table B-Tree Interior Cell (header 0x05):</p></dt>
 | |
| <dd><p></p><ul>
 | |
| <li>A 4-byte big-endian page number which is the left child pointer.
 | |
| </li><li>A varint which is the integer key
 | |
| </li></ul></dd>
 | |
| 
 | |
| <dt><p>Index B-Tree Leaf Cell (header 0x0a):</p></dt>
 | |
| <dd><p></p><ul>
 | |
| <li>A varint which is the total number of bytes of key payload, including any
 | |
| overflow
 | |
| </li><li>The initial portion of the payload that does not spill to overflow
 | |
| pages.
 | |
| </li><li>A 4-byte big-endian integer page number for the first page of the
 | |
| overflow page list - omitted if all payload fits on the b-tree page.
 | |
| </li></ul></dd>
 | |
| 
 | |
| <dt><p>Index B-Tree Interior Cell (header 0x02):</p></dt>
 | |
| <dd><p></p><ul>
 | |
| <li>A 4-byte big-endian page number which is the left child pointer.
 | |
| </li><li>A varint which is the total number of bytes of key payload, including any
 | |
| overflow
 | |
| </li><li>The initial portion of the payload that does not spill to overflow
 | |
| pages.
 | |
| </li><li>A 4-byte big-endian integer page number for the first page of the
 | |
| overflow page list - omitted if all payload fits on the b-tree page.
 | |
| </li></ul></dd>
 | |
| </dl>
 | |
| 
 | |
| <p>The information above can be recast into a table format as follows:</p>
 | |
| 
 | |
| <a name="cellformat"></a>
 | |
| 
 | |
| <center>
 | |
| <i>B-tree Cell Format</i>
 | |
| <table border="1" width="80%">
 | |
| <tr><th rowspan="2">Datatype
 | |
|     </th><th colspan="4">Appears in...
 | |
|     </th><th rowspan="2">Description
 | |
| </th></tr><tr><th>Table Leaf (0x0d)
 | |
|     </th><th>Table Interior (0x05)
 | |
|     </th><th>Index Leaf (0x0a)
 | |
|     </th><th>Index Interior (0x02)
 | |
| </th></tr><tr><td align="center" valign="top">4-byte integer
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="left">Page number of left child
 | |
| </td></tr><tr><td align="center" valign="top">varint
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="left">Number of bytes of payload
 | |
| </td></tr><tr><td align="center" valign="top">varint
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="left">Rowid
 | |
| </td></tr><tr><td align="center" valign="top">byte array
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="left">Payload
 | |
| </td></tr><tr><td align="center" valign="top">4-byte integer
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top"> 
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="center" valign="top">✔
 | |
|     </td><td align="left">Page number of first overflow page
 | |
| </td></tr></table></center>
 | |
| 
 | |
| 
 | |
| <p>The amount of payload that spills onto overflow pages also depends on
 | |
| the page type.  For the following computations, let U be the usable size
 | |
| of a database page, the total page size less the reserved space at the
 | |
| end of each page.  And let P be the payload size.  In the following,
 | |
| symbol X represents the maximum amount of payload that can be stored directly
 | |
| on the b-tree page without spilling onto an overflow page and symbol M
 | |
| represents the minimum amount of payload that must be stored on the btree
 | |
| page before spilling is allowed.
 | |
| 
 | |
| </p><dl>
 | |
| <dt><p>Table B-Tree Leaf Cell:</p></dt>
 | |
| <dd><p>
 | |
| Let X be U-35.  If the payload size P is less than or equal to X then
 | |
| the entire payload is stored on the b-tree leaf page.
 | |
| Let M be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)).
 | |
| If P is greater than X
 | |
| then the number of bytes stored on the table b-tree leaf page is K
 | |
| if K is less or equal to X or M otherwise.
 | |
| The number of bytes stored on the leaf page is never less than M.
 | |
| </p></dd>
 | |
| 
 | |
| <dt><p>Table B-Tree Interior Cell:</p></dt>
 | |
| <dd><p>
 | |
| Interior pages of table b-trees have no payload and so there is never
 | |
| any payload to spill.
 | |
| </p></dd>
 | |
| 
 | |
| <dt><p>Index B-Tree Leaf Or Interior Cell:</p></dt>
 | |
| <dd><p>
 | |
| Let X be ((U-12)*64/255)-23.  If the payload size P is less than
 | |
| or equal to X then the entire payload is stored on the b-tree page.
 | |
| Let M be ((U-12)*32/255)-23 and let K be M+((P-M)%(U-4)).
 | |
| If P is greater than X then the number
 | |
| of bytes stored on the index b-tree page is K if K is less than or
 | |
| equal to X or M otherwise.
 | |
| The number of bytes stored on the index page is never less than M.
 | |
| </p></dd>
 | |
| </dl>
 | |
| 
 | |
| <p>Here is an alternative description of the same computation:
 | |
| 
 | |
| </p><ul>
 | |
| <li>X is U-35 for table btree leaf pages or
 | |
|     ((U-12)*64/255)-23 for index pages.
 | |
| </li><li>M is always ((U-12)*32/255)-23.
 | |
| </li><li>Let K be M+((P-M)%(U-4)).
 | |
| </li><li>If P<=X then all P bytes of payload are stored directly on the 
 | |
|     btree page without overflow.
 | |
| </li><li>If P>X and K<=X then the first K bytes of P are stored on the 
 | |
|     btree page and the remaining P-K bytes are stored on overflow pages.
 | |
| </li><li>If P>X and K>X then the first M bytes of P are stored on the
 | |
|     btree page and the remaining P-M bytes are stored on overflow pages.
 | |
| </li></ul>
 | |
| 
 | |
| <p>The overflow thresholds are designed to give a minimum fanout of
 | |
| 4 for index b-trees and to make sure enough of the payload
 | |
| is on the b-tree page that the record header can usually be accessed
 | |
| without consulting an overflow page.  In hindsight, the designer of
 | |
| the SQLite b-tree logic realized that these thresholds could have been
 | |
| made much simpler.  However, the computations cannot be changed
 | |
| without resulting in an incompatible file format.  And the current computations
 | |
| work well, even if they are a little complex.</p>
 | |
| 
 | |
| <a name="ovflpgs"></a>
 | |
| 
 | |
| <h2 id="cell_payload_overflow_pages"><span>1.7. </span>Cell Payload Overflow Pages</h2>
 | |
| 
 | |
| <p>When the payload of a b-tree cell is too large for the b-tree page,
 | |
| the surplus is spilled onto overflow pages.  Overflow pages form a linked
 | |
| list.  The first four bytes of each overflow page are a big-endian
 | |
| integer which is the page number of the next page in the chain, or zero
 | |
| for the final page in the chain.  The fifth byte through the last usable
 | |
| byte are used to hold overflow content.</p>
 | |
| 
 | |
| <h2 id="pointer_map_or_ptrmap_pages"><span>1.8. </span>Pointer Map or Ptrmap Pages</h2>
 | |
| 
 | |
| <p>Pointer map or ptrmap pages are extra pages inserted into the database
 | |
| to make the operation of <a href="pragma.html#pragma_auto_vacuum">auto_vacuum</a> and <a href="pragma.html#pragma_incremental_vacuum">incremental_vacuum</a> modes
 | |
| more efficient.  Other page types in the database typically have pointers
 | |
| from parent to child.  For example, an interior b-tree page contains pointers
 | |
| to its child b-tree pages and an overflow chain has a pointer
 | |
| from earlier to later links in the chain.  A ptrmap page contains linkage
 | |
| information going in the opposite direction, from child to parent.</p>
 | |
| 
 | |
| <p>Ptrmap pages must exist in any database file which has a non-zero
 | |
| largest root b-tree page value at offset 52 in the database header.
 | |
| If the largest root b-tree page value is zero, then the database must not
 | |
| contain ptrmap pages.</p>
 | |
| 
 | |
| <p>In a database with ptrmap pages, the first ptrmap page is page 2.
 | |
| A ptrmap page consists of an array of 5-byte entries.  Let J be the
 | |
| number of 5-byte entries that will fit in the usable space of a page.
 | |
| (In other words, J=U/5.)  The first ptrmap page will contain back pointer
 | |
| information for pages 3 through J+2, inclusive.  The second pointer map
 | |
| page will be on page J+3 and that ptrmap page will provide back pointer
 | |
| information for pages J+4 through 2*J+3 inclusive.  And so forth for
 | |
| the entire database file.</p>
 | |
| 
 | |
| <p>In a database that uses ptrmap pages, all pages at locations identified
 | |
| by the computation in the previous paragraph must be ptrmap page and no
 | |
| other page may be a ptrmap page.  Except, if the byte-lock page happens to
 | |
| fall on the same page number as a ptrmap page, then the ptrmap is moved
 | |
| to the following page for that one case.</p>
 | |
| 
 | |
| <p>Each 5-byte entry on a ptrmap page provides back-link information about 
 | |
| one of the pages that immediately follow the pointer map.  If page B is a
 | |
| ptrmap page then back-link information about page B+1 is provided by
 | |
| the first entry on the pointer map.  Information about page B+2 is
 | |
| provided by the second entry.  And so forth.</p>
 | |
| 
 | |
| <p>Each 5-byte ptrmap entry consists of one byte of "page type" information
 | |
| followed by a 4-byte big-endian page number.  Five page types are recognized:
 | |
| </p>
 | |
| 
 | |
| <ol>
 | |
| <li>A b-tree root page.  The
 | |
| page number should be zero.
 | |
| </li><li>A freelist page.  The page number should be
 | |
| zero.
 | |
| </li><li>The first page of a
 | |
| cell payload overflow chain.  The page number is the b-tree page that
 | |
| contains the cell whose content has overflowed.
 | |
| </li><li>A page in an overflow chain
 | |
| other than the first page.  The page number is the prior page of the
 | |
| overflow chain.
 | |
| </li><li>A non-root b-tree page.  The
 | |
| page number is the parent b-tree page.
 | |
| </li></ol>
 | |
| 
 | |
| <p>In any database file that contains ptrmap pages, all b-tree root pages 
 | |
| must come before any non-root b-tree page, cell payload overflow page, or
 | |
| freelist page.  This restriction ensures that a root page will never
 | |
| be moved during an auto-vacuum or incremental-vacuum.  The auto-vacuum
 | |
| logic does not know how to update the root_page field of the sqlite_schema
 | |
| table and so it is necessary to prevent root pages from being moved
 | |
| during an auto-vacuum in order to preserve the integrity of the
 | |
| sqlite_schema table.  Root pages are moved to the beginning of the
 | |
| database file by the CREATE TABLE, CREATE INDEX, DROP TABLE, and
 | |
| DROP INDEX operations.</p>
 | |
| 
 | |
| <h1 id="schema_layer"><span>2. </span>Schema Layer</h1>
 | |
| 
 | |
| <p>The foregoing text describes low-level aspects of the SQLite file
 | |
| format.  The b-tree mechanism provides a powerful and efficient means of
 | |
| accessing a large data set.  This section will describe how the
 | |
| low-level b-tree layer is used to implement higher-level SQL
 | |
| capabilities.</p>
 | |
| 
 | |
| <a name="record_format"></a>
 | |
| 
 | |
| <h2 id="record_format"><span>2.1. </span>Record Format</h2>
 | |
| 
 | |
| <p>The data for a table b-tree leaf page and the key
 | |
| of an index b-tree page was characterized above
 | |
| as an arbitrary sequence of bytes.
 | |
| The prior discussion mentioned one key being less than another, but
 | |
| did not define what "less than" meant.  The current section will address
 | |
| these omissions.</p>
 | |
| 
 | |
| <p>Payload, either table b-tree data or index b-tree keys, 
 | |
| is always in the "record format".
 | |
| The record format defines a sequence of values corresponding
 | |
| to columns in a table or index.  The record format specifies the number
 | |
| of columns, the datatype of each column, and the content of each column.</p>
 | |
| 
 | |
| <p>The record format makes extensive use of the 
 | |
| <a href="fileformat2.html#varint">variable-length integer</a> or <a href="fileformat2.html#varint">varint</a>
 | |
| representation of 64-bit signed integers defined above.</p>
 | |
| 
 | |
| <a name="serialtype"></a>
 | |
| 
 | |
| <p>A record contains a header and a body, in that order.  
 | |
| The header begins with a single varint which determines the total number
 | |
| of bytes in the header.  The varint value is the size of the header in
 | |
| bytes including the size varint itself.  Following the size varint are
 | |
| one or more additional varints, one per column.  These additional varints
 | |
| are called "serial type" numbers and
 | |
| determine the datatype of each column, according to the following chart:</p>
 | |
| 
 | |
| <center>
 | |
| <i>Serial Type Codes Of The Record Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Serial Type</th><th>Content Size</th><th>Meaning
 | |
| </th></tr><tr><td valign="top" align="center">0</td><td valign="top" align="center">0</td><td align="left">
 | |
| Value is a NULL.
 | |
| </td></tr><tr><td valign="top" align="center">1</td><td valign="top" align="center">1</td><td align="left">
 | |
| Value is an 8-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">2</td><td valign="top" align="center">2</td><td align="left">
 | |
| Value is a big-endian 16-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">3</td><td valign="top" align="center">3</td><td align="left">
 | |
| Value is a big-endian 24-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">4</td><td valign="top" align="center">4</td><td align="left">
 | |
| Value is a big-endian 32-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">5</td><td valign="top" align="center">6</td><td align="left">
 | |
| Value is a big-endian 48-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">6</td><td valign="top" align="center">8</td><td align="left">
 | |
| Value is a big-endian 64-bit twos-complement integer.
 | |
| </td></tr><tr><td valign="top" align="center">7</td><td valign="top" align="center">8</td><td align="left">
 | |
| Value is a big-endian IEEE 754-2008 64-bit floating point number.
 | |
| </td></tr><tr><td valign="top" align="center">8</td><td valign="top" align="center">0</td><td align="left">
 | |
| Value is the integer 0. (Only available for <a href="fileformat2.html#schemaformat">schema format</a> 4 and higher.)
 | |
| </td></tr><tr><td valign="top" align="center">9</td><td valign="top" align="center">0</td><td align="left">
 | |
| Value is the integer 1. (Only available for <a href="fileformat2.html#schemaformat">schema format</a> 4 and higher.)
 | |
| </td></tr><tr><td valign="top" align="center">10,11
 | |
|     </td><td valign="top" align="center"><i>variable</i></td><td align="left">
 | |
| <i>Reserved for internal use.  These serial type codes will
 | |
|    never appear in a well-formed database file, but they
 | |
|    might be used in transient and temporary database files
 | |
|    that SQLite sometimes generates for its own use.
 | |
|    The meanings of these codes can shift from one release
 | |
|    of SQLite to the next.</i>
 | |
| </td></tr><tr><td valign="top" align="center">N≥12 and even
 | |
|     </td><td valign="top" align="center">(N-12)/2</td><td align="left">
 | |
| Value is a BLOB that is (N-12)/2 bytes in length.
 | |
| </td></tr><tr><td valign="top" align="center">N≥13 and odd
 | |
|     </td><td valign="top" align="center">(N-13)/2</td><td align="left">
 | |
| Value is a string in the <a href="fileformat2.html#enc">text encoding</a> and (N-13)/2 bytes in length.
 | |
| The nul terminator is not stored.
 | |
| </td></tr></table></center>
 | |
| 
 | |
| <p>The header size varint
 | |
| and serial type varints will usually consist of a single byte.  The
 | |
| serial type varints for large strings and BLOBs might extend to two or three
 | |
| byte varints, but that is the exception rather than the rule. 
 | |
| The varint format is very efficient at coding the record header.</p>
 | |
| 
 | |
| <p>The values for each column in the record immediately follow the header.
 | |
| For serial types 0, 8, 9, 12, and 13, the value is zero bytes in
 | |
| length.  If all columns are of these types then the body section of the
 | |
| record is empty.</p>
 | |
| 
 | |
| <p>A record might have fewer values than the number of columns in the
 | |
| corresponding table.  This can happen, for example, after an
 | |
| <a href="lang_altertable.html">ALTER TABLE ... ADD COLUMN</a> SQL statement has increased
 | |
| the number of columns in the table schema without modifying preexisting rows
 | |
| in the table.
 | |
| Missing values at the end of the record are filled in using the
 | |
| <a href="lang_createtable.html#dfltval">default value</a> for the corresponding columns defined in the table schema.
 | |
| </p>
 | |
| 
 | |
| 
 | |
| <h2 id="record_sort_order"><span>2.2. </span>Record Sort Order</h2>
 | |
| 
 | |
| <p>The order of keys in an index b-tree is determined by the sort order of
 | |
| the records that the keys represent.  Record comparison progresses column
 | |
| by column.  Columns of a record are examined from left to right.  The
 | |
| first pair of columns that are not equal determines the relative order
 | |
| of the two records.  The sort order of individual columns is as
 | |
| follows:</p>
 | |
| 
 | |
| <ol>
 | |
| <li>NULL values (serial type 0) sort first.
 | |
| </li><li>Numeric values (serial types 1 through 9) sort after NULLs
 | |
|       and in numeric order.
 | |
| </li><li>Text values (odd serial types 13 and larger) sort after numeric
 | |
|     values in the order determined by the columns <a href="datatype3.html#collation">collating function</a>.
 | |
| </li><li>BLOB values (even serial types 12 and larger) sort last and in the order 
 | |
|     determined by memcmp().
 | |
| </li></ol>
 | |
| 
 | |
| <p>A <a href="datatype3.html#collation">collating function</a> for each column is necessary in order to compute
 | |
| the order of text fields.
 | |
| SQLite defines three built-in collating functions:
 | |
| </p>
 | |
| 
 | |
| <blockquote><table border="0" cellspacing="10">
 | |
| <tr><td valign="top">BINARY
 | |
|     </td><td> The built-in BINARY collation compares strings byte by byte
 | |
|         using the memcmp() function
 | |
|         from the standard C library.
 | |
| </td></tr><tr><td valign="top">NOCASE
 | |
|     </td><td> The NOCASE collation is like BINARY except that uppercase
 | |
|         ASCII characters ('A' through 'Z')
 | |
|         are folded into their lowercase equivalents prior to running the
 | |
|         comparison.  Only ASCII characters are case-folded.
 | |
|         NOCASE
 | |
|         does not implement a general purpose unicode caseless comparison.
 | |
| </td></tr><tr><td valign="top">RTRIM
 | |
|     </td><td> RTRIM is like BINARY except that extra spaces at the end of either
 | |
|          string do not change the result.  In other words, strings will
 | |
|          compare equal to one another as long as they
 | |
|          differ only in the number of spaces at the end.
 | |
| </td></tr></table></blockquote>
 | |
| 
 | |
| <p>Additional application-specific collating functions can be added to
 | |
| SQLite using the <a href="c3ref/create_collation.html">sqlite3_create_collation()</a> interface.</p>
 | |
| 
 | |
| <p>The default collating function for all strings is BINARY.
 | |
| Alternative collating functions for table columns can be specified in the
 | |
| <a href="lang_createtable.html">CREATE TABLE</a> statement using the COLLATE clause on the <a href="lang_createtable.html#tablecoldef">column definition</a>.
 | |
| When a column is indexed, the same collating function specified in the
 | |
| <a href="lang_createtable.html">CREATE TABLE</a> statement is used for the column in the index, by default,
 | |
| though this can be overridden using a COLLATE clause in the 
 | |
| <a href="lang_createindex.html">CREATE INDEX</a> statement.
 | |
| 
 | |
| </p><h2 id="representation_of_sql_tables"><span>2.3. </span>Representation Of SQL Tables</h2>
 | |
| 
 | |
| <p> Each ordinary SQL table in the database schema is represented on-disk
 | |
| by a table b-tree.  Each entry in the table b-tree corresponds to a row
 | |
| of the SQL table.  The <a href="lang_createtable.html#rowid">rowid</a> of the SQL table is the 64-bit signed
 | |
| integer key for each entry in the table b-tree.</p>
 | |
| 
 | |
| <p> The content of each SQL table row is stored in the database file by
 | |
| first combining the values in the various columns into a byte array
 | |
| in the record format, then storing that byte array as the payload in
 | |
| an entry in the table b-tree.  The order of values in the record is
 | |
| the same as the order of columns in the SQL table definition.
 | |
| When an SQL table includes an
 | |
| <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> column (which aliases the <a href="lang_createtable.html#rowid">rowid</a>) then that
 | |
| column appears in the record as a NULL value.  SQLite will always use
 | |
| the table b-tree key rather than the NULL value when referencing the
 | |
| <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> column.</p>
 | |
| 
 | |
| <p> If the <a href="datatype3.html#affinity">affinity</a> of a column is REAL and that column contains a
 | |
| value that can be converted to an integer without loss of information
 | |
| (if the value contains no fractional part and is not too large to be
 | |
| represented as an integer) then the column may be stored in the record
 | |
| as an integer.  SQLite will convert the value back to floating
 | |
| point when extracting it from the record.</p>
 | |
| 
 | |
| <h2 id="representation_of_without_rowid_tables"><span>2.4. </span>Representation of WITHOUT ROWID Tables</h2>
 | |
| 
 | |
| <p>If an SQL table is created using the "WITHOUT ROWID" clause at the
 | |
| end of its CREATE TABLE statement, then that table is a <a href="withoutrowid.html">WITHOUT ROWID</a>
 | |
| table and uses a different on-disk representation.  A WITHOUT ROWID
 | |
| table uses an index b-tree rather than a table b-tree for storage.
 | |
| The key for each entry in the WITHOUT ROWID b-tree is a record composed
 | |
| of the columns of the PRIMARY KEY followed by all remaining columns of
 | |
| the table.  The primary key columns appear in the order they they were
 | |
| declared in the PRIMARY KEY clause and the remaining columns appear in
 | |
| the order they occur in the CREATE TABLE statement.
 | |
| 
 | |
| </p><p>Hence, the content encoding for a WITHOUT ROWID table is the same
 | |
| as the content encoding for an ordinary rowid table, except that the
 | |
| order of the columns is rearranged so that PRIMARY KEY columns come
 | |
| first, and the content is used as the key in an index b-tree rather
 | |
| than as the data in a table b-tree.
 | |
| The special encoding rules for columns with REAL affinity
 | |
| apply to WITHOUT ROWID tables the same as they do with rowid tables.
 | |
| 
 | |
| </p><h3 id="suppression_of_redundant_columns_in_the_primary_key_of_without_rowid_tables"><span>2.4.1. </span>Suppression of redundant columns in the PRIMARY KEY
 | |
|     of WITHOUT ROWID tables</h3>
 | |
| 
 | |
| <p>If the PRIMARY KEY of a WITHOUT ROWID tables uses the same columns
 | |
| with the same collating sequence more than once, then the second and
 | |
| subsequent occurrences of that column in the PRIMARY KEY definition are
 | |
| ignored.  For example, the following CREATE TABLE statements all specify
 | |
| the same table, which will have the exact same representation on disk:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE t1(a,b,c,d,PRIMARY KEY(a,c)) WITHOUT ROWID);
 | |
| CREATE TABLE t1(a,b,c,d,PRIMARY KEY(a,c,a,c)) WITHOUT ROWID);
 | |
| CREATE TABLE t1(a,b,c,d,PRIMARY KEY(a,A,a,C)) WITHOUT ROWID);
 | |
| CREATE TABLE t1(a,b,c,d,PRIMARY KEY(a,a,a,a,c)) WITHOUT ROWID);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The first example above is the preferred definition of the table,
 | |
| of course.  All of the examples create a WITHOUT ROWID table with
 | |
| two PRIMARY KEY columns, "a" and "c", in that order, followed by
 | |
| two data columns "b" and "d", also in that order.
 | |
| 
 | |
| </p><h2 id="representation_of_sql_indices"><span>2.5. </span>Representation Of SQL Indices</h2>
 | |
| 
 | |
| <p>Each SQL index, whether explicitly declared via a <a href="lang_createindex.html">CREATE INDEX</a> statement
 | |
| or implied by a UNIQUE or PRIMARY KEY constraint, corresponds to an 
 | |
| index b-tree in the database file.
 | |
| Each entry in the index b-tree corresponds to a single row in the 
 | |
| associated SQL table.
 | |
| The key to an index b-tree is
 | |
| a record composed of the columns that are being indexed followed by the
 | |
| key of the corresponding table row.  For ordinary tables, the row key is
 | |
| the <a href="lang_createtable.html#rowid">rowid</a>, and for <a href="withoutrowid.html">WITHOUT ROWID</a> tables the row key is the PRIMARY KEY.
 | |
| Because every row in the table has a unique row key,
 | |
| all keys in an index are unique.</p>
 | |
| 
 | |
| <p>In a normal index, there is a one-to-one mapping between rows in a 
 | |
| table and entries in each index associated with that table.
 | |
| However, in a <a href="partialindex.html">partial index</a>, the index b-tree only contains entries
 | |
| corresponding to table rows for which the WHERE clause expression on the
 | |
| CREATE INDEX statement is true.
 | |
| Corresponding rows in the index and table b-trees share the same rowid
 | |
| or primary key values and contain the same value for all indexed columns.</p>
 | |
| 
 | |
| <h3 id="suppression_of_redundant_columns_in_without_rowid_secondary_indexes_"><span>2.5.1. </span>Suppression of redundant columns in WITHOUT ROWID secondary indexes
 | |
| </h3>
 | |
| 
 | |
| <p> In an index on a WITHOUT ROWID table, if a column of the PRIMARY KEY
 | |
| is also a column in the index and has a matching collating sequence, then the
 | |
| indexed column is not repeated in the table-key suffix on the
 | |
| end of the index record.  As an example, consider the following SQL:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE ex25(a,b,c,d,e,PRIMARY KEY(d,c,a)) WITHOUT rowid;
 | |
| CREATE INDEX ex25ce ON ex25(c,e);
 | |
| CREATE INDEX ex25acde ON ex25(a,c,d,e);
 | |
| CREATE INDEX ex25ae ON ex25(a COLLATE nocase,e);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>Each row in the ex25ce index is a record
 | |
| with these columns: c, e, d, a.  The first two columns are
 | |
| the columns being indexed, c and e.  The remaining columns are the primary
 | |
| key of the corresponding table row.  Normally, the primary key would be
 | |
| columns d, c, and a, but because column c already appears earlier in the
 | |
| index, it is omitted from the key suffix.</p>
 | |
| 
 | |
| <p>In the extreme case where the columns being indexed cover all columns
 | |
| of the PRIMARY KEY, the index will consist of only the columns being
 | |
| indexed.  The ex25acde example above demonstrates this.  Each entry in
 | |
| the ex25acde index consists of only the columns a, c, d, and e, in that
 | |
| order.</p>
 | |
| 
 | |
| <p>Each row in ex25ae contains five columns: a, e, d, c, a.  The "a"
 | |
| column is repeated since the first occurrence of "a" has a collating
 | |
| function of "nocase" and the second has a collating sequence of "binary".
 | |
| If the "a" column is not repeated and if the table contains two or more
 | |
| entries with the same "e" value and where "a" differs only in case, then
 | |
| all of those table entries would correspond to a single entry in the
 | |
| index, which would break the one-to-one correspondence between the table
 | |
| and the index.
 | |
| 
 | |
| </p><p> The suppression of redundant columns in the key suffix of an index
 | |
| entry only occurs in WITHOUT ROWID tables.  In an ordinary rowid table,
 | |
| the index entry always ends with the rowid even if the <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a>
 | |
| column is one of the columns being indexed.</p>
 | |
| 
 | |
| <a name="ffschema"></a>
 | |
| 
 | |
| <h2 id="storage_of_the_sql_database_schema"><span>2.6. </span>Storage Of The SQL Database Schema</h2>
 | |
| 
 | |
| <p>Page 1 of a database file is the root page of a table b-tree that
 | |
| holds a special table named "<a href="schematab.html">sqlite_schema</a>".  This b-tree is known
 | |
| as the "schema table" since it stores the complete
 | |
| database schema.  The structure of the sqlite_schema table is as
 | |
| if it had been created using the following SQL:</p>
 | |
| 
 | |
| <blockquote><pre>
 | |
| CREATE TABLE sqlite_schema(
 | |
|   type text,
 | |
|   name text,
 | |
|   tbl_name text,
 | |
|   rootpage integer,
 | |
|   sql text
 | |
| );
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The sqlite_schema table contains one row for each table, index, view,
 | |
| and trigger (collectively "objects") in the database schema, except there
 | |
| is no entry for the sqlite_schema table itself.  The sqlite_schema table
 | |
| contains entries for <a href="fileformat2.html#intschema">internal schema objects</a> in addition to application-
 | |
| and programmer-defined objects.
 | |
| 
 | |
| 
 | |
| </p><p>The sqlite_schema.type column will be one
 | |
| of the following text strings:  'table', 'index', 'view', or 'trigger'
 | |
| according to the type of object defined.  The 'table' string is used
 | |
| for both ordinary and <a href="vtab.html">virtual tables</a>.</p>
 | |
| 
 | |
| <p>The sqlite_schema.name column will hold the name of the object.
 | |
| <a href="lang_createtable.html#uniqueconst">UNIQUE</a> and <a href="lang_createtable.html#primkeyconst">PRIMARY KEY</a> constraints on tables cause SQLite to create
 | |
| <a href="fileformat2.html#intschema">internal indexes</a> with names of the form "sqlite_autoindex_TABLE_N"
 | |
| where TABLE is replaced by the name of the table that contains the
 | |
| constraint and N is an integer beginning with 1 and increasing by one
 | |
| with each constraint seen in the table definition.
 | |
| In a <a href="withoutrowid.html">WITHOUT ROWID</a> table, there is no sqlite_schema entry for the
 | |
| PRIMARY KEY, but the "sqlite_autoindex_TABLE_N" name is set aside
 | |
| for the PRIMARY KEY as if the sqlite_schema entry did exist.  This
 | |
| will affect the numbering of subsequent UNIQUE constraints.
 | |
| The "sqlite_autoindex_TABLE_N" name is never allocated for an
 | |
| <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a>, either in rowid tables or WITHOUT ROWID tables.
 | |
| </p>
 | |
| 
 | |
| <p>The sqlite_schema.tbl_name column holds the name of a table or view
 | |
| that the object is associated with.  For a table or view, the
 | |
| tbl_name column is a copy of the name column.  For an index, the tbl_name
 | |
| is the name of the table that is indexed.  For a trigger, the tbl_name
 | |
| column stores the name of the table or view that causes the trigger 
 | |
| to fire.</p>
 | |
| 
 | |
| <p>The sqlite_schema.rootpage column stores the page number of the root
 | |
| b-tree page for tables and indexes.  For rows that define views, triggers,
 | |
| and virtual tables, the rootpage column is 0 or NULL.</p>
 | |
| 
 | |
| <p>The sqlite_schema.sql column stores SQL text that describes the
 | |
| object.  This SQL text is a <a href="lang_createtable.html">CREATE TABLE</a>, <a href="lang_createvtab.html">CREATE VIRTUAL TABLE</a>,
 | |
| <a href="lang_createindex.html">CREATE INDEX</a>,
 | |
| <a href="lang_createview.html">CREATE VIEW</a>, or <a href="lang_createtrigger.html">CREATE TRIGGER</a> statement that if evaluated against
 | |
| the database file when it is the main database of a <a href="c3ref/sqlite3.html">database connection</a>
 | |
| would recreate the object.  The text is usually a copy of the original
 | |
| statement used to create the object but with normalizations applied so
 | |
| that the text conforms to the following rules:
 | |
| 
 | |
| </p><ul>
 | |
| <li>The CREATE, TABLE, VIEW, TRIGGER, and INDEX keywords at the beginning
 | |
| of the statement are converted to all upper case letters.
 | |
| </li><li>The TEMP or TEMPORARY keyword is removed if it occurs after the 
 | |
| initial CREATE keyword.
 | |
| </li><li>Any database name qualifier that occurs prior to the name of the
 | |
| object being created is removed.
 | |
| </li><li>Leading spaces are removed.
 | |
| </li><li>All spaces following the first two keywords are converted into a single
 | |
| space.
 | |
| </li></ul>
 | |
| 
 | |
| <p>The text in the sqlite_schema.sql column is a copy of the original
 | |
| CREATE statement text that created the object, except normalized as
 | |
| described above and as modified by subsequent <a href="lang_altertable.html">ALTER TABLE</a> statements.
 | |
| The sqlite_schema.sql is NULL for the <a href="fileformat2.html#intschema">internal indexes</a> that are
 | |
| automatically created by <a href="lang_createtable.html#uniqueconst">UNIQUE</a> or <a href="lang_createtable.html#primkeyconst">PRIMARY KEY</a> constraints.</p>
 | |
| 
 | |
| <h3 id="alternative_names_for_the_schema_table"><span>2.6.1. </span>Alternative Names For The Schema Table</h3>
 | |
| 
 | |
| <p>The name "sqlite_schema" does not appear anywhere in the file format.
 | |
| That name is just a convention used by the database implementation.
 | |
| Due to historical and operational considerations, the
 | |
| "sqlite_schema" table can also sometimes be called by one of the
 | |
| following aliases:
 | |
| 
 | |
| </p><ol>
 | |
| <li> sqlite_master
 | |
| </li><li> sqlite_temp_schema
 | |
| </li><li> sqlite_temp_master
 | |
| </li></ol>
 | |
| 
 | |
| <p>Because the name of the schema table does not appear anywhere in
 | |
| the file format, the meaning of the database file is not changed if
 | |
| the application chooses to refer to the schema table by one of
 | |
| these alternative names.
 | |
| 
 | |
| <a name="intschema"></a>
 | |
| 
 | |
| </p><h3 id="internal_schema_objects"><span>2.6.2. </span>Internal Schema Objects</h3>
 | |
| 
 | |
| <p>In addition to the tables, indexes, views, and triggers created by
 | |
| the application and/or the developer using CREATE statements SQL, the
 | |
| sqlite_schema table may contain zero or more entries for 
 | |
| <i>internal schema objects</i> that are created by SQLite for its 
 | |
| own internal use.  The names of internal schema objects
 | |
| always begin with "sqlite_" and any table, index, view, or trigger
 | |
| whose name begins with "sqlite_" is an internal schema object.
 | |
| SQLite prohibits applications from creating objects whose names begin
 | |
| with "sqlite_".  
 | |
| 
 | |
| </p><p>Internal schema objects used by SQLite may include the following:
 | |
| 
 | |
| </p><ul>
 | |
| <li><p>Indices with names of the form "sqlite_autoindex_TABLE_N" that
 | |
|        are used to implement <a href="lang_createtable.html#uniqueconst">UNIQUE</a> and <a href="lang_createtable.html#primkeyconst">PRIMARY KEY</a> constraints on
 | |
|        ordinary tables.
 | |
| 
 | |
| </p></li><li><p>A table with the name "sqlite_sequence" that is used to keep track
 | |
|        of the maximum historical <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> for a table
 | |
|        using <a href="autoinc.html">AUTOINCREMENT</a>.
 | |
| 
 | |
| </p></li><li><p>Tables with names of the form "sqlite_statN" where N is an integer.
 | |
|        Such tables store database statistics gathered by the <a href="lang_analyze.html">ANALYZE</a>
 | |
|        command and used by the query planner to help determine the best
 | |
|        algorithm to use for each query.
 | |
| </p></li></ul>
 | |
| 
 | |
| <p>New internal schema objects names, always beginning with "sqlite_",
 | |
| may be added to the SQLite file format in future releases.
 | |
| 
 | |
| <a name="seqtab"></a>
 | |
| 
 | |
| </p><h3 id="the_sqlite_sequence_table"><span>2.6.3. </span>The sqlite_sequence table</h3>
 | |
| 
 | |
| <p>The sqlite_sequence table is an internal table used to help implement
 | |
| <a href="autoinc.html">AUTOINCREMENT</a>.  The sqlite_sequence table is created automatically
 | |
| whenever any ordinary table with an AUTOINCREMENT integer primary
 | |
| key is created.  Once created, the sqlite_sequence table exists in the
 | |
| sqlite_schema table forever; it cannot be dropped.
 | |
| The schema for the sqlite_sequence table is:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE sqlite_sequence(name,seq);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>There is a single row in the sqlite_sequence table for each ordinary
 | |
| table that uses AUTOINCREMENT.  The name of the table (as it appears in
 | |
| sqlite_schema.name) is in the sqlite_sequence.main field and the largest
 | |
| <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> ever inserted into that table is 
 | |
| in the sqlite_sequence.seq field.  
 | |
| New automatically generated integer primary keys for AUTOINCREMENT
 | |
| tables are guaranteed to be larger than the sqlite_sequence.seq field for
 | |
| that table.
 | |
| If the sqlite_sequence.seq field of an AUTOINCREMENT table is already at
 | |
| the largest integer value (9223372036854775807) then attempts to add new
 | |
| rows to that table with an automatically generated integer primary will fail
 | |
| with an <a href="rescode.html#full">SQLITE_FULL</a> error.
 | |
| The sqlite_sequence.seq field is automatically updated if required when
 | |
| new entries are inserted to an AUTOINCREMENT table.  
 | |
| The sqlite_sequence row for an AUTOINCREMENT table is automatically deleted
 | |
| when the table is dropped.
 | |
| If the sqlite_sequence row for an AUTOINCREMENT table does not exist when
 | |
| the AUTOINCREMENT table is updated, then a new sqlite_sequence row is created.
 | |
| If the sqlite_sequence.seq value for an AUTOINCREMENT table is manually 
 | |
| set to something other than an integer and there is a subsequent attempt to
 | |
| insert the or update the AUTOINCREMENT table, then the behavior is undefined.
 | |
| 
 | |
| </p><p>Application code is allowed to modify the sqlite_sequence table, to add
 | |
| new rows, to delete rows, or to modify existing rows.  However, application
 | |
| code cannot create the sqlite_sequence table if it does not already exist.
 | |
| Application code can delete all entries from the sqlite_sequence table,
 | |
| but application code cannot drop the sqlite_sequence table.
 | |
| 
 | |
| <a name="stat1tab"></a>
 | |
| 
 | |
| </p><h3 id="the_sqlite_stat1_table"><span>2.6.4. </span>The sqlite_stat1 table</h3>
 | |
| 
 | |
| <p>The sqlite_stat1 is an internal table created by the <a href="lang_analyze.html">ANALYZE</a> command
 | |
| and used to hold supplemental information about tables and indexes that the
 | |
| query planner can use to help it find better ways of performing queries.
 | |
| Applications can update, delete from, insert into or drop the sqlite_stat1
 | |
| table, but may not create or alter the sqlite_stat1 table.
 | |
| The schema of the sqlite_stat1 table is as follows:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE sqlite_stat1(tbl,idx,stat);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p> There is normally one row per index, with the index identified by the
 | |
| name in the sqlite_stat1.idx column.  The sqlite_stat1.tbl column is
 | |
| the name of the table to which the index belongs.  In each such row, 
 | |
| the sqlite_stat.stat column will be
 | |
| a string consisting of a list of integers followed by zero or more 
 | |
| arguments.  The first integer in this
 | |
| list is the approximate number of rows in the index.  (The number of
 | |
| rows in the index is the same as the number of rows in the table,
 | |
| except for <a href="partialindex.html">partial indexes</a>.)
 | |
| The second integer is the approximate number of rows in the index
 | |
| that have the same value in the first column of the index.  The third 
 | |
| integer is the number number of rows in the index that have 
 | |
| the same value for the first two columns.  The N-th integer (for N>1) 
 | |
| is the estimated average number of rows in 
 | |
| the index which have the same value for the first N-1 columns.  For
 | |
| a K-column index, there will be K+1 integers in the stat column.  If
 | |
| the index is unique, then the last integer will be 1.
 | |
| 
 | |
| </p><p>The list of integers in the stat column can optionally be followed
 | |
| by arguments, each of which is a sequence of non-space characters.
 | |
| All arguments are preceded by a single space.
 | |
| Unrecognized arguments are silently ignored.
 | |
| 
 | |
| </p><p>If the "unordered" argument is present, then the query planner assumes
 | |
| that the index is unordered and will not use the index for a range query
 | |
| or for sorting.
 | |
| 
 | |
| </p><p>The "sz=NNN" argument (where NNN represents a sequence of 1 or more digits)
 | |
| means that the average row size over all records of the table or
 | |
| index is NNN bytes per row.  The SQLite query planner might use the
 | |
| estimated row size information provided by the "sz=NNN" token
 | |
| to help it choose smaller tables and indexes that require less disk I/O.
 | |
| 
 | |
| </p><p>The presence of the "noskipscan" token on the sqlite_stat1.stat field
 | |
| of an index prevents that index from being used with the
 | |
| <a href="optoverview.html#skipscan">skip-scan optimization</a>.
 | |
| 
 | |
| </p><p>New text tokens may be added to the end of the stat column in future
 | |
| enhancements to SQLite.  For compatibility, unrecognized tokens at the end
 | |
| of the stat column are silently ignored.
 | |
| 
 | |
| </p><p>If the sqlite_stat1.idx column is NULL, then the sqlite_stat1.stat
 | |
| column contains a single integer which is the approximate number of
 | |
| rows in the table identified by sqlite_stat1.tbl.
 | |
| If the sqlite_stat1.idx column is the same as the sqlite_stat1.tbl
 | |
| column, then the table is a <a href="withoutrowid.html">WITHOUT ROWID</a> table and the sqlite_stat1.stat
 | |
| field contains information about the index btree that implements the
 | |
| WITHOUT ROWID table.
 | |
| 
 | |
| <a name="stat2tab"></a>
 | |
| 
 | |
| </p><h3 id="the_sqlite_stat2_table"><span>2.6.5. </span>The sqlite_stat2 table</h3>
 | |
| 
 | |
| <p>The sqlite_stat2 is only created and is only used if SQLite is compiled
 | |
| with SQLITE_ENABLE_STAT2 and if the SQLite version number is between
 | |
| 3.6.18 (2009-09-11) and 3.7.8 (2011-09-19).
 | |
| The sqlite_stat2 table is neither read nor written by any
 | |
| version of SQLite before 3.6.18 nor after 3.7.8.
 | |
| The sqlite_stat2 table contains additional information
 | |
| about the distribution of keys within an index.
 | |
| The schema of the sqlite_stat2 table is as follows:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE sqlite_stat2(tbl,idx,sampleno,sample);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The sqlite_stat2.idx column and the sqlite_stat2.tbl column in each 
 | |
| row of the sqlite_stat2 table identify an index described by that row.
 | |
| There are usually 10 rows in the sqlite_stat2
 | |
| table for each index.
 | |
| 
 | |
| </p><p>The sqlite_stat2 entries for an index that have sqlite_stat2.sampleno
 | |
| between 0 and 9 inclusive are samples of the left-most key value in the
 | |
| index taken at evenly spaced points along the index.
 | |
| Let C be the number of rows in the index.
 | |
| Then the sampled rows are given by
 | |
| 
 | |
| </p><blockquote>
 | |
|      rownumber = (i*C*2 + C)/20
 | |
| </blockquote>
 | |
| 
 | |
| <p>The variable i in the previous expression varies between 0 and 9.
 | |
| Conceptually, the index space is divided into
 | |
| 10 uniform buckets and the samples are the middle row from each bucket.
 | |
| 
 | |
| </p><p>The format for sqlite_stat2 is recorded here for legacy reference.  
 | |
| Recent versions of SQLite no longer support sqlite_stat2 and the
 | |
| sqlite_stat2 table, if is exists, is simply ignored.
 | |
| 
 | |
| <a name="stat3tab"></a>
 | |
| 
 | |
| </p><h3 id="the_sqlite_stat3_table"><span>2.6.6. </span>The sqlite_stat3 table</h3>
 | |
| 
 | |
| <p>The sqlite_stat3 is only used if SQLite is compiled
 | |
| with <a href="compile.html#enable_stat3">SQLITE_ENABLE_STAT3</a> or <a href="compile.html#enable_stat4">SQLITE_ENABLE_STAT4</a>
 | |
| and if the SQLite version number is 3.7.9 (2011-11-01) or greater.
 | |
| The sqlite_stat3 table is neither read nor written by any
 | |
| version of SQLite before 3.7.9.
 | |
| If the <a href="compile.html#enable_stat4">SQLITE_ENABLE_STAT4</a> compile-time option is used and the
 | |
| SQLite version number is 3.8.1 (2013-10-17) or greater,
 | |
| then sqlite_stat3 might be read but not written.
 | |
| The sqlite_stat3 table contains additional information
 | |
| about the distribution of keys within an index, information that the
 | |
| query planner can use to devise better and faster query algorithms.
 | |
| The schema of the sqlite_stat3 table is as follows:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE sqlite_stat3(tbl,idx,nEq,nLt,nDLt,sample);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>There are usually multiple entries in the sqlite_stat3 table for each index.
 | |
| The sqlite_stat3.sample column holds the value of the left-most field of an
 | |
| index identified by sqlite_stat3.idx and sqlite_stat3.tbl.
 | |
| The sqlite_stat3.nEq column holds the approximate
 | |
| number of entries in the index whose left-most column exactly matches
 | |
| the sample.
 | |
| The sqlite_stat3.nLt holds the approximate number of entries in the
 | |
| index whose left-most column is less than the sample.
 | |
| The sqlite_stat3.nDLt column holds the approximate
 | |
| number of distinct left-most entries in the index that are less than
 | |
| the sample.
 | |
| 
 | |
| </p><p>There can be an arbitrary number of sqlite_stat3 entries per index.
 | |
| The <a href="lang_analyze.html">ANALYZE</a> command will typically generate sqlite_stat3 tables
 | |
| that contain between 10 and 40 samples that are distributed across
 | |
| the key space and with large nEq values.
 | |
| 
 | |
| </p><p>In a well-formed sqlite_stat3 table, the samples for any single
 | |
| index must appear in the same order that they occur in the index.  
 | |
| In other words, if the entry with left-most column S1 is earlier in
 | |
| the index b-tree than the
 | |
| entry with left-most column S2, then in the sqlite_stat3 table, 
 | |
| sample S1 must have a smaller rowid than sample S2.
 | |
| 
 | |
| <a name="stat4tab"></a>
 | |
| 
 | |
| </p><h3 id="the_sqlite_stat4_table"><span>2.6.7. </span>The sqlite_stat4 table</h3>
 | |
| 
 | |
| <p>The sqlite_stat4 is only created and is only used if SQLite is compiled
 | |
| with <a href="compile.html#enable_stat4">SQLITE_ENABLE_STAT4</a> and if the SQLite version number is
 | |
| 3.8.1 (2013-10-17) or greater.
 | |
| The sqlite_stat4 table is neither read nor written by any
 | |
| version of SQLite before 3.8.1.
 | |
| The sqlite_stat4 table contains additional information
 | |
| about the distribution of keys within an index or the distribution of
 | |
| keys in the primary key of a <a href="withoutrowid.html">WITHOUT ROWID</a> table.
 | |
| The query planner can sometimes use the additional information in
 | |
| the sqlite_stat4 table to devise better and faster query algorithms.
 | |
| The schema of the sqlite_stat4 table is as follows:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| CREATE TABLE sqlite_stat4(tbl,idx,nEq,nLt,nDLt,sample);
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>There are typically between 10 to 40 entries in the sqlite_stat4 table for
 | |
| each index for which statistics are available, however these limits are
 | |
| not hard bounds.
 | |
| The meanings of the columns in the sqlite_stat4 table are as follows:
 | |
| 
 | |
| </p><center>
 | |
| <table border="0" width="100%" cellpadding="10">
 | |
| <tr><td valign="top" align="right">tbl:</td>
 | |
|     <td>The sqlite_stat4.tbl column holds name of the table that owns
 | |
|     the index that the row describes
 | |
| 
 | |
| </td></tr><tr><td valign="top" align="right">idx:</td>
 | |
|     <td>The sqlite_stat4.idx column holds name of the index that the
 | |
|     row describes, or in the case of
 | |
|     an sqlite_stat4 entry for a <a href="withoutrowid.html">WITHOUT ROWID</a> table, the
 | |
|     name of the table itself.
 | |
| 
 | |
| </td></tr><tr><td valign="top" align="right">sample:</td>
 | |
|     <td>The sqlite_stat4.sample column holds a BLOB
 | |
|     in the <a href="fileformat2.html#record_format">record format</a> that encodes the indexed columns followed by
 | |
|     the rowid for a rowid table or by the columns of the primary key 
 | |
|     for a WITHOUT ROWID table.
 | |
|     The sqlite_stat4.sample BLOB for the WITHOUT ROWID table itself 
 | |
|     contains just the columns of the primary key.
 | |
|     Let the number of columns encoded by the sqlite_stat4.sample blob be N.
 | |
|     For indexes on an ordinary rowid table, N will be one more than the number 
 | |
|     of columns indexed.
 | |
|     For indexes on WITHOUT ROWID tables, N will be the number of columns
 | |
|     indexed plus the number of columns in the primary key.
 | |
|     For a WITHOUT ROWID table, N will be the number of columns in the
 | |
|     primary key.
 | |
| 
 | |
| </td></tr><tr><td valign="top" align="right">nEq:</td>
 | |
|     <td>The sqlite_stat4.nEq column holds a list of N integers where 
 | |
|     the K-th integer is the approximate number of entries in the index
 | |
|     whose left-most K columns exactly match the K left-most columns
 | |
|     of the sample.
 | |
| 
 | |
| </td></tr><tr><td valign="top" align="right">nLt:</td>
 | |
|     <td>The sqlite_stat4.nLt column holds a list of N integers where
 | |
|     the K-th integer is the approximate number of entries in the
 | |
|     index whose K left-most columns are collectively less than the 
 | |
|     K left-most columns of the sample.
 | |
| 
 | |
| </td></tr><tr><td valign="top" align="right">nDLt:</td>
 | |
|     <td>The sqlite_stat4.nDLt column holds a list of N integers where
 | |
|     the K-th integer is the approximate
 | |
|     number of entries in the index that are distinct in the first K columns and
 | |
|     where the left-most K columns are collectively less than the left-most
 | |
|     K columns of the sample.
 | |
| </td></tr></table>
 | |
| </center>
 | |
| 
 | |
| <p>The sqlite_stat4 is a generalization of the sqlite_stat3 table.  The
 | |
| sqlite_stat3 table provides information about the left-most column of an
 | |
| index whereas the sqlite_stat4 table provides information about all columns
 | |
| of the index.
 | |
| 
 | |
| </p><p>There can be an arbitrary number of sqlite_stat4 entries per index.
 | |
| The <a href="lang_analyze.html">ANALYZE</a> command will typically generate sqlite_stat4 tables
 | |
| that contain between 10 and 40 samples that are distributed across
 | |
| the key space and with large nEq values.
 | |
| 
 | |
| </p><p>In a well-formed sqlite_stat4 table, the samples for any single
 | |
| index must appear in the same order that they occur in the index.  
 | |
| In other words, if entry S1 is earlier in the index b-tree than 
 | |
| entry S2, then in the sqlite_stat4 table, sample S1 must have a
 | |
| smaller rowid than sample S2.
 | |
| 
 | |
| <a name="rollbackjournal"></a>
 | |
| 
 | |
| </p><h1 id="the_rollback_journal"><span>3. </span>The Rollback Journal</h1>
 | |
| 
 | |
| <p>The rollback journal is a file associated with each SQLite database
 | |
| file that holds information used to restore the database file to its initial
 | |
| state during the course of a transaction.
 | |
| The rollback journal file is always located in the same 
 | |
| directory as the database
 | |
| file and has the same name as the database file but with the string
 | |
| "<tt>-journal</tt>" appended.  There can only be a single rollback journal
 | |
| associated with a give database and hence there can only be one write
 | |
| transaction open against a single database at one time.</p>
 | |
| 
 | |
| <p>If a transaction is aborted due to an application crash, an operating
 | |
| system crash, or a hardware power failure or crash, then the database may
 | |
| be left in an inconsistent state.  The next time SQLite attempts to open
 | |
| the database file, the presence of the rollback journal file will be 
 | |
| detected and the journal will be automatically played back to restore the
 | |
| database to its state at the start of the incomplete transaction.</p>
 | |
| 
 | |
| <p>A rollback journal is only considered to be valid if it exists and
 | |
| contains a valid header.  Hence a transaction can be committed in one
 | |
| of three ways:
 | |
| </p><ol>
 | |
| <li>The rollback journal file can be deleted,
 | |
| </li><li>The rollback journal file can be truncated to zero length, or
 | |
| </li><li>The header of the rollback journal can be overwritten with
 | |
| invalid header text (for example, all zeros).
 | |
| </li></ol>
 | |
| <p>
 | |
| These three ways of committing a transaction correspond to the DELETE,
 | |
| TRUNCATE, and PERSIST settings, respectively, of the <a href="pragma.html#pragma_journal_mode">journal_mode pragma</a>.
 | |
| </p>
 | |
| 
 | |
| 
 | |
| <p>A valid rollback journal begins with a header in the following format:</p>
 | |
| 
 | |
| <center>
 | |
| <i>Rollback Journal Header Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td valign="top" align="center">0
 | |
|     </td><td valign="top" align="center">8
 | |
|     </td><td>Header string:  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd7
 | |
| </td></tr><tr><td valign="top" align="center">8
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>The "Page Count" - The number of pages in the next segment of the 
 | |
|         journal, or -1 to
 | |
|         mean all content to the end of the file
 | |
| </td></tr><tr><td valign="top" align="center">12
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>A random nonce for the checksum
 | |
| </td></tr><tr><td valign="top" align="center">16
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>Initial size of the database in pages
 | |
| </td></tr><tr><td valign="top" align="center">20
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>Size of a disk sector assumed by the process that wrote this
 | |
|         journal.
 | |
| </td></tr><tr><td valign="top" align="center">24
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>Size of pages in this journal.
 | |
| </td></tr></table>
 | |
| </center>
 | |
| 
 | |
| <p>A rollback journal header is padded with zeros out to the size of a 
 | |
| single sector (as defined by the sector size integer at offset 20).
 | |
| The header is in a sector by itself so that if a power loss occurs while
 | |
| writing the sector, information that follows the header will be
 | |
| (hopefully) undamaged.</p>
 | |
| 
 | |
| <p>After the header and zero padding are zero or more page records.  Each
 | |
| page record stores a copy of the content of a page from the database file
 | |
| before it was changed.  The same page may not appear more than once
 | |
| within a single rollback journal.
 | |
| To rollback an incomplete transaction, a process
 | |
| has merely to read the rollback journal from beginning to end and
 | |
| write pages found in the journal back into the database file at the
 | |
| appropriate location.</p>
 | |
| 
 | |
| <p>Let the database page size (the value of the integer at offset 24 
 | |
| in the journal header) be N.
 | |
| Then the format of a page record is as follows:</p>
 | |
| 
 | |
| <center>
 | |
| <i>Rollback Journal Page Record Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td valign="top" align="center">0
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>The page number in the database file
 | |
| </td></tr><tr><td valign="top" align="center">4
 | |
|     </td><td valign="top" align="center">N
 | |
|     </td><td>Original content of the page prior to the start of the transaction
 | |
| </td></tr><tr><td valign="top" align="center">N+4
 | |
|     </td><td valign="top" align="center">4
 | |
|     </td><td>Checksum
 | |
| </td></tr></table>
 | |
| </center>
 | |
| 
 | |
| 
 | |
| <p>The checksum is an unsigned 32-bit integer computed as follows:</p>
 | |
| 
 | |
| <ol>
 | |
| <li>Initialize the checksum to the checksum nonce value found in the
 | |
| journal header at offset 12.
 | |
| </li><li>Initialize index X to be N-200 (where N is the size of a database page
 | |
| in bytes.
 | |
| </li><li>Interpret the byte at offset X into the page as an 8-bit unsigned integer
 | |
|  and add the value of that integer to the checksum.
 | |
| </li><li>Subtract 200 from X.
 | |
| </li><li>If X is greater than or equal to zero, go back to step 3.
 | |
| </li></ol>
 | |
| 
 | |
| <p>The checksum value is used to guard against incomplete writes of
 | |
| a journal page record following a power failure.  A different random nonce
 | |
| is used each time a transaction is started in order to minimize the risk
 | |
| that unwritten sectors might by chance contain data from the same page
 | |
| that was a part of prior journals.  By changing the nonce for each
 | |
| transaction, stale data on disk will still generate an incorrect checksum
 | |
| and be detected with high probability.  The checksum only uses a sparse sample
 | |
| of 32-bit words from the data record for performance reasons - design studies 
 | |
| during the planning phases of SQLite 3.0.0 showed
 | |
| a significant performance hit in checksumming the entire page.</p>
 | |
| 
 | |
| <p>Let the page count value at offset 8 in the journal header be M.
 | |
| If M is greater than zero then after M page records the journal file
 | |
| may be zero padded out to the next multiple of the sector size and another
 | |
| journal header may be inserted.  All journal headers within the same
 | |
| journal must contain the same database page size and sector size.</p>
 | |
| 
 | |
| <p>If M is -1 in the initial journal header, then the number of page records
 | |
| that follow is computed by computing how many page records will fit in
 | |
| the available space of the remainder of the journal file.</p>
 | |
| 
 | |
| <a name="walformat"></a>
 | |
| 
 | |
| <h1 id="the_write_ahead_log"><span>4. </span>The Write-Ahead Log</h1>
 | |
| 
 | |
| <p>Beginning with <a href="releaselog/3_7_0.html">version 3.7.0</a> (2010-07-21), 
 | |
| SQLite supports a new transaction
 | |
| control mechanism called "<a href="wal.html">write-ahead log</a>" or "<a href="wal.html">WAL</a>".
 | |
| When a database is in WAL mode, all connections to that database must
 | |
| use the WAL.  A particular database will use either a rollback journal
 | |
| or a WAL, but not both at the same time.
 | |
| The WAL is always located in the same directory as the database
 | |
| file and has the same name as the database file but with the string
 | |
| "<tt>-wal</tt>" appended.</p>
 | |
| 
 | |
| <h2 id="wal_file_format"><span>4.1. </span>WAL File Format</h2>
 | |
| 
 | |
| <p>A <a href="wal.html#walfile">WAL file</a> consists of a header followed by zero or more "frames".
 | |
| Each frame records the revised content of a single page from the
 | |
| database file.  All changes to the database are recorded by writing
 | |
| frames into the WAL.  Transactions commit when a frame is written that
 | |
| contains a commit marker.  A single WAL can and usually does record 
 | |
| multiple transactions.  Periodically, the content of the WAL is
 | |
| transferred back into the database file in an operation called a
 | |
| "checkpoint".</p>
 | |
| 
 | |
| <p>A single WAL file can be reused multiple times.  In other words, the
 | |
| WAL can fill up with frames and then be checkpointed and then new
 | |
| frames can overwrite the old ones.  A WAL always grows from beginning
 | |
| toward the end.  Checksums and counters attached to each frame are
 | |
| used to determine which frames within the WAL are valid and which
 | |
| are leftovers from prior checkpoints.</p>
 | |
| 
 | |
| <p>The WAL header is 32 bytes in size and consists of the following eight
 | |
| big-endian 32-bit unsigned integer values:</p>
 | |
| 
 | |
| <center>
 | |
| <i>WAL Header Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td valign="top" align="center">0</td><td valign="top" align="center">4
 | |
|     </td><td>Magic number.  0x377f0682 or 0x377f0683
 | |
| </td></tr><tr><td valign="top" align="center">4</td><td valign="top" align="center">4
 | |
|     </td><td>File format version.  Currently 3007000.
 | |
| </td></tr><tr><td valign="top" align="center">8</td><td valign="top" align="center">4
 | |
|     </td><td>Database page size.  Example: 1024
 | |
| </td></tr><tr><td valign="top" align="center">12</td><td valign="top" align="center">4
 | |
|     </td><td>Checkpoint sequence number
 | |
| </td></tr><tr><td valign="top" align="center">16</td><td valign="top" align="center">4
 | |
|     </td><td>Salt-1: random integer incremented with each checkpoint
 | |
| </td></tr><tr><td valign="top" align="center">20</td><td valign="top" align="center">4
 | |
|     </td><td>Salt-2: a different random number for each checkpoint
 | |
| </td></tr><tr><td valign="top" align="center">24</td><td valign="top" align="center">4
 | |
|     </td><td>Checksum-1: First part of a checksum on the first 24 bytes of header
 | |
| </td></tr><tr><td valign="top" align="center">28</td><td valign="top" align="center">4
 | |
|     </td><td>Checksum-2: Second part of the checksum on the first 24 bytes of header
 | |
| </td></tr></table>
 | |
| </center>
 | |
| 
 | |
| <p>Immediately following the wal-header are zero or more frames. Each
 | |
| frame consists of a 24-byte frame-header followed by a <i>page-size</i> bytes
 | |
| of page data. The frame-header is six big-endian 32-bit unsigned 
 | |
| integer values, as follows:
 | |
| 
 | |
| </p><center>
 | |
| <i>WAL Frame Header Format</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Offset</th><th>Size</th><th>Description
 | |
| </th></tr><tr><td valign="top" align="center">0</td><td valign="top" align="center">4
 | |
|     </td><td>Page number
 | |
| </td></tr><tr><td valign="top" align="center">4</td><td valign="top" align="center">4
 | |
|     </td><td>For commit records, the size of the database file in pages
 | |
|         after the commit.  For all other records, zero.
 | |
| </td></tr><tr><td valign="top" align="center">8</td><td valign="top" align="center">4
 | |
|     </td><td>Salt-1 copied from the WAL header
 | |
| </td></tr><tr><td valign="top" align="center">12</td><td valign="top" align="center">4
 | |
|     </td><td>Salt-2 copied from the WAL header
 | |
| </td></tr><tr><td valign="top" align="center">16</td><td valign="top" align="center">4
 | |
|     </td><td>Checksum-1:  Cumulative checksum up through and including this page
 | |
| </td></tr><tr><td valign="top" align="center">20</td><td valign="top" align="center">4
 | |
|     </td><td>Checksum-2:  Second half of the cumulative checksum.
 | |
| </td></tr></table>
 | |
| </center>
 | |
| 
 | |
| <p>A frame is considered valid if and only if the following conditions are
 | |
| true:</p>
 | |
| 
 | |
| <ol>
 | |
| <li><p>The salt-1 and salt-2 values in the frame-header match
 | |
|        salt values in the wal-header</p></li>
 | |
| 
 | |
| <li><p>The checksum values in the final 8 bytes of the frame-header
 | |
|        exactly match the checksum computed consecutively on the
 | |
|        first 24 bytes of the WAL header and the first 8 bytes and
 | |
|        the content of all frames
 | |
|        up to and including the current frame.</p></li>
 | |
| </ol>
 | |
| 
 | |
| <a name="walcksm"></a>
 | |
| 
 | |
| <h2 id="checksum_algorithm"><span>4.2. </span>Checksum Algorithm</h2>
 | |
| 
 | |
| <p>The checksum is computed by interpreting the input as
 | |
| an even number of unsigned 32-bit integers: x(0) through x(N).
 | |
| The 32-bit integers are big-endian if the
 | |
| magic number in the first 4 bytes of the WAL header is 0x377f0683 and
 | |
| the integers are little-endian if the magic number is 0x377f0682.
 | |
| The checksum values are always stored in the frame header in a
 | |
| big-endian format regardless of which byte order is used to compute
 | |
| the checksum.</p>
 | |
| 
 | |
| <p>The checksum algorithm only works for content which is a multiple of
 | |
| 8 bytes in length.  In other words, if the inputs are x(0) through x(N)
 | |
| then N must be odd.
 | |
| The checksum algorithm is as follows:
 | |
| 
 | |
| </p><blockquote><pre> 
 | |
| s0 = s1 = 0
 | |
| for i from 0 to n-1 step 2:
 | |
|    s0 += x(i) + s1;
 | |
|    s1 += x(i+1) + s0;
 | |
| endfor
 | |
| # result in s0 and s1
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The outputs s0 and s1 are both weighted checksums using Fibonacci weights
 | |
| in reverse order.  (The largest Fibonacci weight occurs on the first element
 | |
| of the sequence being summed.)  The s1 value spans all 32-bit integer
 | |
| terms of the sequence whereas s0 omits the final term.</p>
 | |
| 
 | |
| <h2 id="checkpoint_algorithm"><span>4.3. </span>Checkpoint Algorithm</h2>
 | |
| 
 | |
| <p>On a <a href="wal.html#ckpt">checkpoint</a>, the WAL is first flushed to persistent storage using
 | |
| the xSync method of the <a href="c3ref/io_methods.html">VFS</a>. 
 | |
| Then valid content of the WAL is transferred into the database file.
 | |
| Finally, the database is flushed to persistent storage using another
 | |
| xSync method call.
 | |
| The xSync operations serve as write barriers - all writes launched
 | |
| before the xSync must complete before any write that launches after the
 | |
| xSync begins.</p>
 | |
| 
 | |
| <p>A checkpoint need not run to completion.  It might be that some
 | |
| readers are still using older transactions with data that is contained
 | |
| in the database file.  In that case, transferring content for newer
 | |
| transactions from the WAL file into the database would delete the content
 | |
| out from under readers still using the older transactions.  To avoid that,
 | |
| checkpoints only run to completion if all reader are using the
 | |
| last transaction in the WAL.
 | |
| 
 | |
| <a name="walreset"></a>
 | |
| 
 | |
| </p><h2 id="wal_reset"><span>4.4. </span>WAL Reset</h2>
 | |
| 
 | |
| <p>After a complete checkpoint, if no other connections are in transactions
 | |
| that use the WAL, then subsequent write transactions can
 | |
| overwrite the WAL file from the beginning.  This is called "resetting the
 | |
| WAL".  At the start of the first new
 | |
| write transaction, the WAL header salt-1 value is incremented
 | |
| and the salt-2 value is randomized.  These changes to the salts invalidate
 | |
| old frames in the WAL that have already been checkpointed but not yet
 | |
| overwritten, and prevent them from being checkpointed again.</p>
 | |
| 
 | |
| <p>The WAL file can optionally be truncated on a reset, but it need not be.
 | |
| Performance is usually a little better if the WAL is not truncated, since
 | |
| filesystems generally will overwrite an existing file faster than they
 | |
| will grow a file.
 | |
| 
 | |
| <a name="walread"></a>
 | |
| 
 | |
| </p><h2 id="reader_algorithm"><span>4.5. </span>Reader Algorithm</h2>
 | |
| 
 | |
| <p>To read a page from the database (call it page number P), a reader
 | |
| first checks the WAL to see if it contains page P.  If so, then the
 | |
| last valid instance of page P that is followed by a commit frame
 | |
| or is a commit frame itself becomes the value read.  If the WAL
 | |
| contains no copies of page P that are valid and which are a commit
 | |
| frame or are followed by a commit frame, then page P is read from
 | |
| the database file.</p>
 | |
| 
 | |
| <p>To start a read transaction, the reader records the number of value
 | |
| frames in the WAL as "mxFrame".  (<a href="walformat.html#mxframe">More detail</a>)
 | |
| The reader uses this recorded mxFrame value
 | |
| for all subsequent read operations.  New transactions can be appended
 | |
| to the WAL, but as long as the reader uses its original mxFrame value
 | |
| and ignores subsequently appended content, the reader will see a 
 | |
| consistent snapshot of the database from a single point in time.  
 | |
| This technique allows multiple concurrent readers to view different 
 | |
| versions of the database content simultaneously.</p>
 | |
| 
 | |
| <p>The reader algorithm in the previous paragraphs works correctly, but 
 | |
| because frames for page P can appear anywhere within the WAL, the
 | |
| reader has to scan the entire WAL looking for page P frames.  If the
 | |
| WAL is large (multiple megabytes is typical) that scan can be slow,
 | |
| and read performance suffers.  To overcome this problem, a separate
 | |
| data structure called the wal-index is maintained to expedite the
 | |
| search for frames of a particular page.</p>
 | |
| 
 | |
| <a name="walindexformat"></a>
 | |
| 
 | |
| <h2 id="wal_index_format"><span>4.6. </span>WAL-Index Format</h2>
 | |
| 
 | |
| <p>Conceptually, the wal-index is shared memory, though the current
 | |
| VFS implementations use a memory-mapped file for operating-system
 | |
| portability.  The memory-mapped
 | |
| file is in the same directory as the database and has the same name
 | |
| as the database with a "<tt>-shm</tt>" suffix appended.  Because
 | |
| the wal-index is shared memory, SQLite does not support 
 | |
| <a href="pragma.html#pragma_journal_mode">journal_mode=WAL</a> 
 | |
| on a network filesystem when clients are on different machines, as
 | |
| all clients of the database must be able to share the same memory.</p>
 | |
| 
 | |
| <p>The purpose of the wal-index is to answer this question quickly:</p>
 | |
| 
 | |
| <blockquote><i>
 | |
| Given a page number P and a maximum WAL frame index M,
 | |
| return the largest WAL frame index for page P that does not exceed M, 
 | |
| or return NULL if there are no frames for page P that do not exceed M.
 | |
| </i></blockquote>
 | |
| 
 | |
| <p>The <i>M</i> value in the previous paragraph is the "mxFrame" value
 | |
| defined in <a href="fileformat2.html#walread">section 4.4</a> that is read at the start 
 | |
| of a transaction and which defines the maximum frame from the WAL that 
 | |
| the reader will use.</p>
 | |
| 
 | |
| <p>The wal-index is transient.  After a crash, the wal-index is
 | |
| reconstructed from the original WAL file.  The VFS is required
 | |
| to either truncate or zero the header of the wal-index when the last
 | |
| connection to it closes.  Because the wal-index is transient, it can
 | |
| use an architecture-specific format; it does not have to be cross-platform.
 | |
| Hence, unlike the database and WAL file formats which store all values
 | |
| as big endian, the wal-index stores multi-byte values in the native
 | |
| byte order of the host computer.</p>
 | |
| 
 | |
| <p>This document is concerned with the persistent state of the database
 | |
| file, and since the wal-index is a transient structure, no further 
 | |
| information about the format of the wal-index will be provided here.
 | |
| Additional details on the format of the wal-index are contained in
 | |
| the separate <a href="walformat.html#walidxfmt">WAL-index File Format</a> document.</p>
 | |
| 
 |