869 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			869 lines
		
	
	
		
			37 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html><head>
 | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
| <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | |
| <link href="sqlite.css" rel="stylesheet">
 | |
| <title>WAL-mode File Format</title>
 | |
| <!-- path= -->
 | |
| </head>
 | |
| <body>
 | |
| <div class=nosearch>
 | |
| <a href="index.html">
 | |
| <img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | |
| </a>
 | |
| <div><!-- IE hack to prevent disappearing logo --></div>
 | |
| <div class="tagline desktoponly">
 | |
| Small. Fast. Reliable.<br>Choose any three.
 | |
| </div>
 | |
| <div class="menu mainmenu">
 | |
| <ul>
 | |
| <li><a href="index.html">Home</a>
 | |
| <li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | |
| <li class='wideonly'><a href='about.html'>About</a>
 | |
| <li class='desktoponly'><a href="docs.html">Documentation</a>
 | |
| <li class='desktoponly'><a href="download.html">Download</a>
 | |
| <li class='wideonly'><a href='copyright.html'>License</a>
 | |
| <li class='desktoponly'><a href="support.html">Support</a>
 | |
| <li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | |
| <li class='search' id='search_menubutton'>
 | |
| <a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="menu submenu" id="submenu">
 | |
| <ul>
 | |
| <li><a href='about.html'>About</a>
 | |
| <li><a href='docs.html'>Documentation</a>
 | |
| <li><a href='download.html'>Download</a>
 | |
| <li><a href='support.html'>Support</a>
 | |
| <li><a href='prosupport.html'>Purchase</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="searchmenu" id="searchmenu">
 | |
| <form method="GET" action="search">
 | |
| <select name="s" id="searchtype">
 | |
| <option value="d">Search Documentation</option>
 | |
| <option value="c">Search Changelog</option>
 | |
| </select>
 | |
| <input type="text" name="q" id="searchbox" value="">
 | |
| <input type="submit" value="Go">
 | |
| </form>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_div(nm) {
 | |
| var w = document.getElementById(nm);
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| }else{
 | |
| w.style.display = "block";
 | |
| }
 | |
| }
 | |
| function toggle_search() {
 | |
| var w = document.getElementById("searchmenu");
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| } else {
 | |
| w.style.display = "block";
 | |
| setTimeout(function(){
 | |
| document.getElementById("searchbox").focus()
 | |
| }, 30);
 | |
| }
 | |
| }
 | |
| function div_off(nm){document.getElementById(nm).style.display="none";}
 | |
| window.onbeforeunload = function(e){div_off("submenu");}
 | |
| /* Disable the Search feature if we are not operating from CGI, since */
 | |
| /* Search is accomplished using CGI and will not work without it. */
 | |
| if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
 | |
| document.getElementById("search_menubutton").style.display = "none";
 | |
| }
 | |
| /* Used by the Hide/Show button beside syntax diagrams, to toggle the */
 | |
| function hideorshow(btn,obj){
 | |
| var x = document.getElementById(obj);
 | |
| var b = document.getElementById(btn);
 | |
| if( x.style.display!='none' ){
 | |
| x.style.display = 'none';
 | |
| b.innerHTML='show';
 | |
| }else{
 | |
| x.style.display = '';
 | |
| b.innerHTML='hide';
 | |
| }
 | |
| return false;
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| <div class=fancy>
 | |
| <div class=nosearch>
 | |
| <div class="fancy_title">
 | |
| WAL-mode File Format
 | |
| </div>
 | |
| <div class="fancy_toc">
 | |
| <a onclick="toggle_toc()">
 | |
| <span class="fancy_toc_mark" id="toc_mk">►</span>
 | |
| Table Of Contents
 | |
| </a>
 | |
| <div id="toc_sub"><div class="fancy-toc1"><a href="#files_on_disk">1. Files On Disk</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_main_database_file">1.1. The Main Database File</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_write_ahead_log_or_wal_file">1.2. The Write-Ahead-Log or "-wal" File</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_wal_index_or_shm_file">1.3. The Wal-Index or "-shm" file</a></div>
 | |
| <div class="fancy-toc2"><a href="#file_lifecycles">1.4. File Lifecycles</a></div>
 | |
| <div class="fancy-toc2"><a href="#variations">1.5. Variations</a></div>
 | |
| <div class="fancy-toc1"><a href="#the_wal_index_file_format">2. The WAL-Index File Format</a></div>
 | |
| <div class="fancy-toc2"><a href="#the_wal_index_header">2.1. The WAL-Index Header</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_mxframe_field">2.1.1. The mxFrame field</a></div>
 | |
| <div class="fancy-toc3"><a href="#the_nbackfill_field">2.1.2. The nBackfill field</a></div>
 | |
| <div class="fancy-toc3"><a href="#wal_locks">2.1.3. WAL Locks</a></div>
 | |
| <div class="fancy-toc2"><a href="#wal_index_hash_tables">2.2. WAL-Index Hash Tables</a></div>
 | |
| <div class="fancy-toc2"><a href="#locking_matrix">2.3. Locking Matrix</a></div>
 | |
| <div class="fancy-toc3"><a href="#how_the_various_locks_are_used">2.3.1. How the various locks are used</a></div>
 | |
| <div class="fancy-toc3"><a href="#operations_that_require_locks_and_which_locks_those_operations_use">2.3.2. Operations that require locks and which locks those operations use</a></div>
 | |
| <div class="fancy-toc1"><a href="#recovery">3. Recovery</a></div>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_toc(){
 | |
| var sub = document.getElementById("toc_sub")
 | |
| var mk = document.getElementById("toc_mk")
 | |
| if( sub.style.display!="block" ){
 | |
| sub.style.display = "block";
 | |
| mk.innerHTML = "▼";
 | |
| } else {
 | |
| sub.style.display = "none";
 | |
| mk.innerHTML = "►";
 | |
| }
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| <p>This document describes low-level details on how <a href="wal.html">WAL mode</a> is
 | |
| implemented on unix and windows.
 | |
| 
 | |
| </p><p>The separate <a href="fileformat2.html">file format</a> description provides details on the
 | |
| structure of a database file and of the write-head log file used in
 | |
| <a href="wal.html">WAL mode</a>.  But details of the locking protocol and of the format
 | |
| of the WAL-index are deliberately omitted since those details
 | |
| are left to discretion of individual <a href="vfs.html">VFS</a> implementations.  This
 | |
| document fills in those missing details for the unix and windows <a href="vfs.html">VFSes</a>.
 | |
| 
 | |
| </p><p>For completeness, some of the higher level formatting information
 | |
| contains in the <a href="fileformat2.html">file format</a> document and elsewhere is replicated here,
 | |
| when it pertains to WAL mode processing.
 | |
| 
 | |
| </p><h1 id="files_on_disk"><span>1. </span>Files On Disk</h1>
 | |
| 
 | |
| <p>When in active use, the state of a WAL mode database is described
 | |
| by three separate files:
 | |
| 
 | |
| </p><ol>
 | |
| <li> The main database file with an arbitrary name "X".
 | |
| </li><li> The write-ahead log file, usually named "X-wal".
 | |
| </li><li> The wal-index file, usually named "X-shm".
 | |
| </li></ol>
 | |
| 
 | |
| <h2 id="the_main_database_file"><span>1.1. </span>The Main Database File</h2>
 | |
| 
 | |
| <p>The format of the main database file is as described in the
 | |
| <a href="fileformat2.html">file format</a> document.  The <a href="fileformat2.html#vnums">file format version numbers</a> at offsets
 | |
| 18 and 19 into the main database must both be 2 to indicate that the
 | |
| database is in WAL mode.  The main database may have an arbitrary
 | |
| name allowed by the underlying filesystem.  No special file suffixes
 | |
| are required, though ".db", ".sqlite", and ".sqlite3" seem to be
 | |
| popular choices.
 | |
| 
 | |
| </p><h2 id="the_write_ahead_log_or_wal_file"><span>1.2. </span>The Write-Ahead-Log or "-wal" File</h2>
 | |
| <p>The write-ahead log or "wal" file is a roll-forward journal
 | |
| that records transactions that have been committed but not yet applied
 | |
| to the main database.  Details on the format of the wal file are
 | |
| describe in the <a href="fileformat2.html#walformat">WAL format</a> subsection of the main <a href="fileformat2.html">file format</a>
 | |
| document.  The wal file is named by appending the four characters
 | |
| "-wal" to the end of the name of the main database file.  Except
 | |
| on 8+3 filesystems, such names are not allowed, and in that case
 | |
| the file suffix is changed to ".WAL".  But as 8+3 filesystems are
 | |
| increasingly rare, that exceptional case can usually be ignored.
 | |
| 
 | |
| <a name="shm"></a>
 | |
| 
 | |
| </p><h2 id="the_wal_index_or_shm_file"><span>1.3. </span>The Wal-Index or "-shm" file</h2>
 | |
| <p>The wal-index file or "shm" file is not actually used as a file.
 | |
| Rather, individual database clients mmap the shm file and use it
 | |
| as shared memory for coordinating access to the database and as a cache
 | |
| for quickly locating frame within the wal file.  The name
 | |
| of the shm file is the main database file name with the four characters
 | |
| "-shm" appended.  Or, for 8+3 filesystems, the shm file is the main
 | |
| database file with the suffix changed to ".SHM".
 | |
| 
 | |
| </p><p>The shm does not contain any database content and is not required
 | |
| to recover the database following a crash.  For that reason, the first
 | |
| client to connect to a quiescent database will normally truncate the
 | |
| shm file if it exists.  Since the content of the shm file does not need
 | |
| to be preserved across a crash, the shm file is never fsync()-ed to disk.
 | |
| In fact, if there were a mechanism by which SQLite could tell the 
 | |
| operating system to never persist the shm file to disk but always hold
 | |
| it in cache memory, SQLite would use that mechanism to avoid any
 | |
| unnecessary disk I/O associated with the shm file.  However, no such
 | |
| mechanism exists in standard posix.
 | |
| 
 | |
| </p><p>Because the shm is only used to coordinate access between concurrent
 | |
| clients, the shm file is omitted if <a href="pragma.html#pragma_locking_mode">exclusive locking mode</a>
 | |
| is set, as an optimization.  When <a href="pragma.html#pragma_locking_mode">exclusive locking mode</a> is set,
 | |
| SQLite uses heap memory in place of the memory-mapped shm file.
 | |
| 
 | |
| </p><h2 id="file_lifecycles"><span>1.4. </span>File Lifecycles</h2>
 | |
| 
 | |
| <p>When a WAL mode database is in active use, all three of the above
 | |
| files usually exist.  Except, the Wal-Index file is omitted if
 | |
| <a href="pragma.html#pragma_locking_mode">exclusive locking mode</a> is set.
 | |
| 
 | |
| </p><p>If the last client using the database shuts down cleanly by
 | |
| calling <a href="c3ref/close.html">sqlite3_close()</a>, then a <a href="wal.html#ckpt">checkpoint</a> is run automatically
 | |
| in order to transfer all information from the wal file
 | |
| over into the main database, and both the shm file
 | |
| and the wal file are unlinked.  Thus, when the database is not in
 | |
| use by any client, it is usually the case that only the main
 | |
| database file exists on disk.
 | |
| However, if the last client did not call <a href="c3ref/close.html">sqlite3_close()</a> before it
 | |
| shut down, or if the last client to disconnect was a read-only client,
 | |
| then the final cleanup operation does not occur and the
 | |
| shm and wal files may still exist on disk even when the database is
 | |
| not in use.
 | |
| 
 | |
| </p><h2 id="variations"><span>1.5. </span>Variations</h2>
 | |
| 
 | |
| <p>When <a href="pragma.html#pragma_locking_mode">PRAGMA locking_mode=EXCLUSIVE</a> (exclusive locking mode) is set,
 | |
| only a single client is allowed to have the database open at one time.  Since
 | |
| only a single client can use the database, the shm file is omitted.
 | |
| The single client uses a buffer in heap memory as a substitute for the
 | |
| memory-mapped shm file.
 | |
| 
 | |
| </p><p>If a read/write client invokes
 | |
| <a href="c3ref/file_control.html">sqlite3_file_control</a>(<a href="c3ref/c_fcntl_begin_atomic_write.html#sqlitefcntlpersistwal">SQLITE_FCNTL_PERSIST_WAL</a>) prior to shutdown,
 | |
| then at shutdown a checkpoint is still run, but the shm file and wal
 | |
| file are not deleted.
 | |
| This allows subsequent read-only clients to connect to and read the
 | |
| database.
 | |
| 
 | |
| <a name="walidxfmt"></a>
 | |
| 
 | |
| </p><h1 id="the_wal_index_file_format"><span>2. </span>The WAL-Index File Format</h1>
 | |
| 
 | |
| <p>
 | |
| The WAL-index or "shm" file is used to coordinate access to the database
 | |
| by multiple clients, and as a cache to help clients quickly locate frames
 | |
| within the wal file.
 | |
| 
 | |
| </p><p>
 | |
| Because the shm file is not involved in recovery, the shm file does not
 | |
| need to be machine byte-order independent.  Hence, numeric values in
 | |
| the shm file are written in the native byte order of the host computer,
 | |
| rather than being converted into a specific cross-platform byte order as
 | |
| is done with the main database file and the wal file.
 | |
| 
 | |
| </p><p>
 | |
| The shm file consists of one or more hash tables, where each hash table
 | |
| is 32768 bytes in size.  Except, a 136-byte header is carved out of the
 | |
| front of the very first hash table, so the first hash table is only
 | |
| 32632 bytes in size.  The total size of the shm file is always a multiple
 | |
| of 32768.  In most cases, the total size of the shm file is exactly 32768
 | |
| bytes.  The shm file only needs to grow beyond a single hash table if
 | |
| when the wal file grows very large (more than 4079 frames).  Since the
 | |
| default <a href="c3ref/wal_autocheckpoint.html">automatic checkpoint threshold</a> is
 | |
| 1000, WAL files rare reach the 4079 threshold needed to make the shm file
 | |
| grow.
 | |
| 
 | |
| </p><h2 id="the_wal_index_header"><span>2.1. </span>The WAL-Index Header</h2>
 | |
| 
 | |
| <p>The first 136 bytes of the shm file are a header.  The shm header has
 | |
| three main divisions as follows:
 | |
| 
 | |
| </p><center>
 | |
| <i>WAL-Index Header Divisions</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Bytes</th><th>Description
 | |
| </th></tr><tr><td valign="top">0..47</td><td>First copy of the WAL Index Information</td>
 | |
| </tr><tr><td valign="top">48..95</td><td>Second copy of the WAL Index Information</td>
 | |
| </tr><tr><td valign="top">96..135</td><td>Checkpoint Information and Locks</td>
 | |
| </tr></table>
 | |
| </center>
 | |
| 
 | |
| <p>Individual fields of the shm header, except for the
 | |
| salt values copied from the WAL header, are unsigned integers
 | |
| in the native byte-order of the host machine.  The salt values
 | |
| are exact copies from the WAL header and are in whatever byte
 | |
| order is used by the WAL file.
 | |
| The size of integers may be 8, 16, 32, or 64 bits.
 | |
| A detailed breakout of the individual fields of the shm header
 | |
| follows:
 | |
| 
 | |
| </p><center>
 | |
| <i>WAL-Index Header Details</i><br>
 | |
| <table width="80%" border="1">
 | |
| <tr><th>Bytes</th><th>Name</th><th>Meaning</th></tr>
 | |
| <tr>
 | |
| <td>0..3</td><td>iVersion</td>
 | |
| <td>The WAL-index format version number.  Always 3007000.</td>
 | |
| </tr>
 | |
| <tr>
 | |
| <td>4..7</td><td> </td>
 | |
| <td>Unused padding space.  Must be zero.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>8..11</td><td>iChange</td>
 | |
| <td>Unsigned integer counter, incremented with each transaction
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>12</td><td>isInit</td>
 | |
| <td>The "isInit" flag.  1 when the shm file has been initialized.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>13</td><td>bigEndCksum</td>
 | |
| <td>True if the WAL file uses big-ending checksums.  0 if the WAL
 | |
|     uses little-endian checksums.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>14..15</td><td>szPage</td>
 | |
| <td>The database page size in bytes, or 1 if the page size is 65536.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>16..19</td><td>mxFrame</td>
 | |
| <td>Number of valid and committed frames in the WAL file.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>20..23</td><td>nPage</td>
 | |
| <td>Size of the database file in pages.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>24..31</td><td>aFrameCksum</td>
 | |
| <td>Checksum of the last frame in the WAL file.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>32..39</td><td>aSalt</td>
 | |
| <td>The two salt value copied from the WAL file header.
 | |
| These values are in the byte-order of the WAL file, which
 | |
| might be different from the native byte-order of the
 | |
| machine.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>40..47</td><td>aCksum</td>
 | |
| <td>A checksum over bytes 0 through 39 of this header.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>48..95</td><td> </td>
 | |
| <td>A copy of bytes 0 through 47 of this header.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>96..99</td><td>nBackfill</td>
 | |
| <td>Number of WAL frames that have already been backfilled into the database
 | |
|     by prior checkpoints
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>100..119</td><td>read-mark[0..4]
 | |
| </td><td>Five "read marks".  Each read mark is a 32-bit unsigned integer (4 bytes).
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>120..127</td><td> </td>
 | |
| <td>Unused space set aside for 8 file locks.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>128..132</td><td>nBackfillAttempted</td>
 | |
| <td>Number of WAL frames that have attempted to be backfilled but which might
 | |
|     not have been backfilled successfully.
 | |
| </td></tr>
 | |
| <tr>
 | |
| <td>132..136</td><td> </td>
 | |
| <td>Unused space reserved for further expansion.
 | |
| </td></tr>
 | |
| </table>
 | |
| </center>
 | |
| 
 | |
| <a name="mxframe"></a>
 | |
| 
 | |
| <h3 id="the_mxframe_field"><span>2.1.1. </span>The mxFrame field</h3>
 | |
| 
 | |
| <p>The 32-bit unsigned integer at offset 16 (and repeated at offset 64)
 | |
| is the number of valid frames in the WAL.  Because WAL frame are numbered
 | |
| starting with 1, mxFrame is also the index of the last valid commit frame
 | |
| in the WAL. A commit frame is a frame that has a non-zero "size of database"
 | |
| value in bytes 4 through 7 of the frame header, and that indicates the end
 | |
| of a transaction.
 | |
| </p><p>
 | |
| When mxFrame field is zero, it indicates that the WAL is empty and that
 | |
| all content should be obtained directly from the database file.
 | |
| </p><p>
 | |
| When mxFrame is equal to <a href="walformat.html#nbackfill">nBackfill</a>, that indicates that all content
 | |
| in the WAL has been written back into the database.
 | |
| In that case, all content can be read directly from the database.
 | |
| Furthermore, the next writer is free to <a href="fileformat2.html#walreset">reset the WAL</a> if no other
 | |
| connections hold locks on WAL_READ_LOCK(N) for N>0.
 | |
| </p><p>
 | |
| The mxFrame value is always greater than or equal to both
 | |
| <a href="walformat.html#nbackfill">nBackfill</a> and nBackfillAttempted.
 | |
| 
 | |
| 
 | |
| <a name="nbackfill"></a>
 | |
| 
 | |
| </p><h3 id="the_nbackfill_field"><span>2.1.2. </span>The nBackfill field</h3>
 | |
| 
 | |
| <p>The 32-bit unsigned integer at offset 128 in the WAL-index header
 | |
| is called the "nBackfill".
 | |
| this field holds the number of frames in the WAL file which
 | |
| have been copied back into the main database.
 | |
| 
 | |
| </p><p>The nBackfill number is never greater than <a href="walformat.html#mxframe">mxFrame</a>.
 | |
| When nBackfill equals <a href="walformat.html#mxframe">mxFrame</a>, that means that the WAL content
 | |
| has been completely written back into the database and it is
 | |
| ok to <a href="fileformat2.html#walreset">reset the WAL</a> if there are no locks held on any of WAL_READ_LOCK(N)
 | |
| for N>0.
 | |
| 
 | |
| </p><p>The nBackfill can only be increased while holding the
 | |
| WAL_CKPT_LOCK.  However, nBackfill is changed to zero
 | |
| during a <a href="fileformat2.html#walreset">WAL reset</a>, and this happens while holding
 | |
| the WAL_WRITE_LOCK.
 | |
| 
 | |
| <a name="locks"></a>
 | |
| 
 | |
| </p><h3 id="wal_locks"><span>2.1.3. </span>WAL Locks</h3>
 | |
| 
 | |
| <p>Eight bytes of space are set aside in the header to support
 | |
| file locking using the xShmLock() method in the <a href="c3ref/io_methods.html">sqlite3_io_methods</a>
 | |
| object.  These eight bytes are never read nor written by SQLite since
 | |
| some VFSes (ex: Windows) might implement locks using mandatory file locks.
 | |
| 
 | |
| </p><p>These are the eight locks supported:
 | |
| 
 | |
| </p><center>
 | |
| <i>WAL-Index Locks Controlled By xShmLock()</i><br>
 | |
| <table border="1">
 | |
| <tr><th rowspan="2">Name</th><th colspan="2">Offset
 | |
| </th></tr><tr><th>xShmLock</th><th>File
 | |
| </th></tr><tr>
 | |
| <td>WAL_WRITE_LOCK
 | |
| </td><td>0
 | |
| </td><td>120
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_CKPT_LOCK
 | |
| </td><td>1
 | |
| </td><td>121
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_RECOVER_LOCK
 | |
| </td><td>2
 | |
| </td><td>122
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_READ_LOCK(0)
 | |
| </td><td>3
 | |
| </td><td>123
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_READ_LOCK(1)
 | |
| </td><td>4
 | |
| </td><td>124
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_READ_LOCK(2)
 | |
| </td><td>5
 | |
| </td><td>125
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_READ_LOCK(3)
 | |
| </td><td>6
 | |
| </td><td>126
 | |
| </td></tr>
 | |
| 
 | |
| <tr>
 | |
| <td>WAL_READ_LOCK(4)
 | |
| </td><td>7
 | |
| </td><td>127
 | |
| </td></tr>
 | |
| </table>
 | |
| </center>
 | |
| 
 | |
| 
 | |
| 
 | |
| <p><i>TBD:  More information about the header</i>
 | |
| 
 | |
| </p><h2 id="wal_index_hash_tables"><span>2.2. </span>WAL-Index Hash Tables</h2>
 | |
| 
 | |
| <p>The hash tables in the shm file are designed to answer the
 | |
| following question quickly:
 | |
| 
 | |
| </p><blockquote><i>
 | |
| FindFrame(P,M):
 | |
| Given a page number P and a maximum WAL frame index M,
 | |
| return the largest WAL frame index for page P that does not exceed M, 
 | |
| or return NULL if there are no frames for page P that do not exceed M.
 | |
| </i></blockquote>
 | |
| 
 | |
| <p>
 | |
| Let the datatypes "u8", "u16", and "u32" mean unsigned integers of
 | |
| length 8, 16, and 32 bits, respectively.  Then, the first 32768-byte unit
 | |
| of the shm file is organized as follows:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| u8 aWalIndexHeader[136];
 | |
| u32 aPgno[4062];
 | |
| u16 aHash[8192];
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The second and all subsequent 32768-byte units of the shm file are
 | |
| like this:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| u32 aPgno[4096];
 | |
| u16 aHash[8192];
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>Collectively, the aPgno entries record the database page number stored
 | |
| in all frames of the WAL file.  The aPgno[0] entry on the first hash table
 | |
| records the database page number stored in the very first frame in the WAL 
 | |
| file. The aPgno[i] entry from the first hash table is the database page number
 | |
| for the i-th frame in the WAL file.  The aPgno[k] entry for the second
 | |
| hash table is the database page number for the (k+4062)-th frame in the
 | |
| WAL file.  The aPgno[k] entry for the n-th 32768-byte hash table in the
 | |
| shm file (for n>1) holds the database page number stored in the
 | |
| (k+4062+4096*(n-2))-th frame of the WAL file.
 | |
| 
 | |
| </p><p>Here is a slightly different way to describe the aPgno values:
 | |
| If you think of all aPgno values as a contiguous array, then
 | |
| the database page number stored in the i-th frame of the WAL file
 | |
| is stored in aPgno[i].  Of course, aPgno is not a contiguous array.
 | |
| The first 4062 entries are on the first 32768-byte unit of the shm
 | |
| file and subsequent values are in 4096 entry chunks in later units
 | |
| of the shm file.
 | |
| 
 | |
| </p><p>One way to compute FindFrame(P,M) would be to scan the aPgno
 | |
| array starting with the M-th entry and working backwards towards
 | |
| the beginning and return J where aPgno[J]==P.  Such an algorithm would
 | |
| work, and it would be faster than searching the whole WAL file for
 | |
| the latest frame with page number P.  But the search can be made
 | |
| much faster still by using the aHash structure.
 | |
| 
 | |
| </p><p>A database page number P is mapped into a hash value
 | |
| using the following hash function:
 | |
| 
 | |
| </p><blockquote>
 | |
| h = (P * 383)%8192
 | |
| </blockquote>
 | |
| 
 | |
| <p>This function maps every page number into an integer between 0
 | |
| and 8191 inclusive.  The aHash field of each 32768-byte shm file unit
 | |
| maps P values into indexes of the aPgno field of the same unit as
 | |
| follows:
 | |
| 
 | |
| </p><ol>
 | |
| <li> Compute the hash value: h = P * 383
 | |
| </li><li> Let X be the largest set of consecutive integers {h, h+1, h+2, ..., h+N}
 | |
|      such that for every j in X, aPgno[j%8192]!=0.  The X set will be empty
 | |
|      if aPgno[h%8192]==0.  The X set is easily computed by starting with
 | |
|      the value h%8192, and adding h%8192 to X and incrementing h until
 | |
|      encountering the first aPgno[h%8192] entry that is zero.
 | |
| </li><li> The set X contains the index in aPgno of every entry in the current
 | |
|      32768-byte unit of the shm file that might possible be a solution
 | |
|      to the FindFrame(P,M) function.  Each of these entries must be
 | |
|      checked separately to ensure that the aPgno value is P and that the
 | |
|      frame number does not exceed M.  The largest frame number that passes
 | |
|      those two tests is the answer.
 | |
| </li></ol>
 | |
| 
 | |
| <p>Each entry in the aPgno array has a single corresponding entry
 | |
| in the aHash array.  There are more available slots in aHash than
 | |
| there are in aPgno.  The unused slots in aHash are filled with zero.
 | |
| And since there are guaranteed to be unused slots in aHash, that means
 | |
| the loop that computes X is guaranteed to terminate.  The expected size
 | |
| of X is less than 2.  The worst case is that X will be the same as the
 | |
| number of entries in aPgno, in which case the algorithm runs at about
 | |
| the same speed as a linear scan of aPgno.  But that worst case performance
 | |
| is exceedingly rare.  Usually, the size of X will be small and the use
 | |
| of the aHash array allows one to compute FindFrame(P,M) much faster.
 | |
| 
 | |
| </p><p>Here is an alternative way of describing the hash look-up algorithm:
 | |
| Start with h = (P * 383)%8192 and look at aHash[h] and subsequent entries,
 | |
| wrapping around to zero when h reaches 8192, until finding an entry with
 | |
| aHash[h]==0.  All aPgno entries having a page number of P will have an
 | |
| index that is one of the aHash[h] values thusly computed.
 | |
| But not all the computed aHash[h] values will
 | |
| meet the matching criteria, so you must check them independently.  The
 | |
| speed advantage comes about because normally this set of h values is
 | |
| very small.
 | |
| 
 | |
| </p><p>Note that each 32768-byte unit of the shm file has its own aHash and
 | |
| aPgno arrays.  The aHash array for a single unit is only helpful in finding
 | |
| aPgno entries in that same unit.  The overall FindFrame(P,M) function
 | |
| needs to do hash lookups beginning with the latest unit and working
 | |
| backwards to the oldest unit until it finds an answer.
 | |
| 
 | |
| </p><h2 id="locking_matrix"><span>2.3. </span>Locking Matrix</h2>
 | |
| 
 | |
| <p>Access is coordinated in WAL mode using both the legacy DELETE-mode
 | |
| locks controlled by the xLock and xUnlock methods of the <a href="c3ref/io_methods.html">sqlite3_io_methods</a>
 | |
| object and the WAL locks controlled by the xShmLock method of the
 | |
| <a href="c3ref/io_methods.html">sqlite3_io_methods</a> object.
 | |
| 
 | |
| </p><p>Conceptually, there is just a single DELETE-mode lock.  The DELETE-mode
 | |
| lock for a single database connection can be in exactly one of the
 | |
| following states:
 | |
| </p><ol>
 | |
| <li value="'0'"> SQLITE_LOCK_NONE (unlocked)
 | |
| </li><li value="'1'"> SQLITE_LOCK_SHARED (reading)
 | |
| </li><li value="'2'"> SQLITE_LOCK_RESERVED (reading, waiting to write)
 | |
| </li><li value="'3'"> SQLITE_LOCK_PENDING (new readers blocked, waiting to write)
 | |
| </li><li value="'4'"> SQLITE_LOCK_EXCLUSIVE (writing)
 | |
| </li></ol>
 | |
| <p>The DELETE-mode locks are stored on the <a href="fileformat2.html#lockbyte">lock-byte page</a> of the
 | |
| main database file.
 | |
| Only SQLITE_LOCK_SHARED and SQLITE_LOCK_EXCLUSIVE are factors for WAL-mode
 | |
| databases.
 | |
| The other locking states are used in rollback-mode, but not in WAL-mode.
 | |
| 
 | |
| </p><p>The <a href="walformat.html#locks">WAL-mode locks</a> are described above.
 | |
| 
 | |
| </p><h3 id="how_the_various_locks_are_used"><span>2.3.1. </span>How the various locks are used</h3>
 | |
| 
 | |
| <p>The following rules show how each of the locks is used.
 | |
| 
 | |
| </p><ul>
 | |
| <li><p><b>SQLITE_LOCK_SHARED</b>
 | |
| </p><p>
 | |
| All connections hold SQLITE_LOCK_SHARED continuously while attached
 | |
| to a WAL-mode database.  This is true for both read/write connections
 | |
| and read-only connections.
 | |
| The SQLITE_LOCK_SHARED lock is held even by connections that are
 | |
| not within transaction.
 | |
| This is different from rollback mode, where the SQLITE_LOCK_SHARED is
 | |
| released at the end of each transaction.
 | |
| 
 | |
| </p></li><li><p><b>SQLITE_LOCK_EXCLUSIVE</b>
 | |
| </p><p>
 | |
| Connections hold an exclusive lock when change in between WAL mode
 | |
| and any of the various rollback-modes.  Connections might also attempt to
 | |
| obtain an EXCLUSIVE lock when they disconnect from WAL mode.  If
 | |
| a connection is able to obtain an EXCLUSIVE lock, that means it is the
 | |
| only connection to the database and so it may attempt to checkpoint
 | |
| and then delete the WAL-index and WAL files.
 | |
| </p><p>
 | |
| When a connection is holding a SHARED lock on the main database,
 | |
| that will prevent any other connection from acquiring the EXCLUSIVE
 | |
| lock, which in turn prevents the WAL-index and WAL files from being
 | |
| deleted out from under other users, and prevents a transition out of
 | |
| WAL-mode while other users are accessing the database in WAL-mode.
 | |
| 
 | |
| 
 | |
| </p></li><li><p><b>WAL_WRITE_LOCK</b>
 | |
| </p><p>
 | |
| The WAL_WRITE_LOCK is only locked exclusively.  There is never a shared
 | |
| lock taken on WAL_WRITE_LOCK.
 | |
| </p><p>
 | |
| An EXCLUSIVE WAL_WRITE_LOCK is held by any connection that is appending
 | |
| content to the end of the WAL.  Hence, only a single process at a time
 | |
| can append content to the WAL.  If a <a href="fileformat2.html#walreset">WAL reset</a> occurs as a consequence of
 | |
| a write, then the <a href="walformat.html#nbackfill">nBackfill</a> field of the WAL-index header is reset to
 | |
| zero while holding this lock.
 | |
| </p><p>An EXCLUSIVE is also held WAL_WRITE_LOCK, and on several other locking
 | |
| bytes, when a connection is running <a href="walformat.html#recovery">recovery</a> on the shared WAL-index.
 | |
| 
 | |
| </p></li><li><p><b>WAL_CKPT_LOCK</b>
 | |
| </p><p>
 | |
| The WAL_CKPT_LOCK is only locked exclusively.  There is never a shared
 | |
| lock taken on WAL_CKPT_LOCK.
 | |
| </p><p>
 | |
| An EXCLUSIVE WAL_CKPT_LOCK is held by any connection that is running
 | |
| a <a href="wal.html#ckpt">checkpoint</a>.  The <a href="walformat.html#nbackfill">nBackfill</a> field of the WAL-index header may be
 | |
| increased while holding this exclusive lock, but it may not be decreased.
 | |
| </p><p>An EXCLUSIVE is also held WAL_CKPT_LOCK, and on several other locking
 | |
| bytes, when a connection is running <a href="walformat.html#recovery">recovery</a> on the shared WAL-index.
 | |
| 
 | |
| </p></li><li><p><b>WAL_RECOVER_LOCK</b>
 | |
| </p><p>
 | |
| The WAL_RECOVER_LOCK is only locked exclusively.  There is never a shared
 | |
| lock taken on WAL_RECOVER_LOCK.
 | |
| </p><p>
 | |
| An EXCLUSIVE WAL_RECOVER_LOCK is held by any connection that is running
 | |
| <a href="walformat.html#recovery">recovery</a> to reconstruct the shared WAL-index.
 | |
| </p><p>
 | |
| A read-only connection that is rebuilding its private heap-memory WAL-index
 | |
| does not hold this lock.  (It cannot, since read-only connections are not
 | |
| allowed to hold any exclusive locks.)  This lock is only held when rebuilding
 | |
| the global shared WAL-index contained in the memory-mapped SHM file.
 | |
| </p><p>
 | |
| In addition to locking this byte, a connection running <a href="walformat.html#recovery">recovery</a> also
 | |
| gets an exclusive lock on all other WAL locks except for WAL_READ_LOCK(0).
 | |
| 
 | |
| </p></li><li><p><b>WAL_READ_LOCK(N)</b>
 | |
| </p><p>
 | |
| There are five separate read locks, numbers 0 through 4.
 | |
| Read locks may be either SHARED or EXCLUSIVE.
 | |
| Connections obtain a shared lock on one of the read locks bytes while
 | |
| they are within a transaction.
 | |
| Connections also obtain an exclusive lock on read locks, one at a time,
 | |
| for the brief moment while they are updating the values of the corresponding
 | |
| read-marks.
 | |
| Read locks 1 through 4 are held exclusively when running <a href="walformat.html#recovery">recovery</a>.
 | |
| </p><p>
 | |
| Each read lock byte corresponds to one of the five 32-bit read-mark
 | |
| integers located in bytes 100 through 119 of the WAL-index header, as
 | |
| follows:
 | |
| </p><center>
 | |
| <table border="1">
 | |
| <tr><th>Lock Name</th><th>Lock offset</th><th>Read-mark name</th><th>Read-mark offset
 | |
| </th></tr><tr><td>WAL_READ_LOCK(0)</td><td>123</td><td>read-mark[0]</td><td>100..103
 | |
| </td></tr><tr><td>WAL_READ_LOCK(1)</td><td>124</td><td>read-mark[1]</td><td>104..107
 | |
| </td></tr><tr><td>WAL_READ_LOCK(2)</td><td>125</td><td>read-mark[2]</td><td>108..111
 | |
| </td></tr><tr><td>WAL_READ_LOCK(3)</td><td>126</td><td>read-mark[3]</td><td>112..115
 | |
| </td></tr><tr><td>WAL_READ_LOCK(4)</td><td>127</td><td>read-mark[4]</td><td>116..119
 | |
| </td></tr></table>
 | |
| </center>
 | |
| <p>
 | |
| When a connection holds a shared lock on WAL_READ_LOCK(N), that is a
 | |
| promise by the connection that it will use the WAL and not the database
 | |
| file for any database pages that are modified by the first
 | |
| read-mark[N] entries in the WAL.
 | |
| The read-mark[0] is always zero.  If a connection holds a shared lock
 | |
| on WAL_READ_LOCK(0), that means the connection expects to be able to ignore
 | |
| the WAL and read any content it wants from the main database.
 | |
| If N>0 then the connection is free to use more of the WAL file beyond
 | |
| read-mark[N] if it wants to, up to the first mxFrame frames.
 | |
| But when a connection holds a shared lock on WAL_READ_LOCK(0), that is a
 | |
| promise that it will never read content from the WAL
 | |
| and will acquire all content directly from the main database.
 | |
| </p><p>
 | |
| When a checkpoint runs, if it sees a lock on WAL_READ_LOCK(N), then it
 | |
| must not move WAL content into the main database for more than the first
 | |
| read-mark[N] frames.  Were it to do so, it would overwrite content that
 | |
| the process holding the lock was expecting to be able to read out of the
 | |
| main database file.  A consequence of if this is that f the WAL file
 | |
| contains more than read-mark[N] frames (if mxFrame>read-mark[N]
 | |
| for any read-mark for which WAL_READ_LOCK(N) is held by another process),
 | |
| then the checkpoint cannot run to completion.
 | |
| </p><p>
 | |
| When a writer wants to <a href="fileformat2.html#walreset">reset the WAL</a>, it must ensure that there are
 | |
| no locks on WAL_READ_LOCK(N) for N>0 because such locks indicate
 | |
| that some other connection is still using the current WAL file and 
 | |
| a <a href="fileformat2.html#walreset">WAL reset</a> would delete content out from those other connections.  It is
 | |
| ok for a <a href="fileformat2.html#walreset">WAL reset</a> to occur if other connections are holding WAL_READ_LOCK(0)
 | |
| because by holding WAL_READ_LOCK(0), those other connections are promising
 | |
| not to use any content from the WAL.
 | |
| </p></li></ul>
 | |
| 
 | |
| <h3 id="operations_that_require_locks_and_which_locks_those_operations_use"><span>2.3.2. </span>Operations that require locks and which locks those operations use</h3>
 | |
| 
 | |
| <ul>
 | |
| <li><p><b>Transition into and out of WAL-mode</b>
 | |
| 
 | |
| </p><p>The SQLITE_LOCK_EXCLUSIVE lock must be held by a connection that wants
 | |
| to transition into our out of WAL mode.
 | |
| Transitioning into WAL mode is, therefore, just like any other write
 | |
| transaction, since every write transaction in rollback mode requires
 | |
| the SQLITE_LOCK_EXCLUSIVE lock.
 | |
| If the database file is already in WAL mode (hence if the desire it to change
 | |
| it back into rollback mode) and if there are two
 | |
| or more connections to the database, then each of these connections will
 | |
| be holding an SQLITE_LOCK_SHARED lock.  That means that the
 | |
| SQLITE_LOCK_EXCLUSIVE cannot be obtained, and the transition out of
 | |
| WAL mode will not be allowed.  This prevents one connection from deleting
 | |
| WAL mode out from under another.  It also means that the only way to move
 | |
| a database from WAL mode into rollback mode is to close all but one
 | |
| connection to the database.
 | |
| 
 | |
| 
 | |
| </p></li><li><p><b>Close a connection to a WAL mode database</b>
 | |
| 
 | |
| </p><p>When a database connection closes (via <a href="c3ref/close.html">sqlite3_close()</a> or
 | |
| <a href="c3ref/close.html">sqlite3_close_v2()</a>), an attempt is made to acquire
 | |
| SQLITE_LOCK_EXCLUSIVE.  If this attempt is successful, that means
 | |
| the connection that is closing is the last connection to the database.
 | |
| In that case, it is desirable to clean up the WAL and WAL-index files,
 | |
| so the closing connection runs a <a href="wal.html#ckpt">checkpoint</a> (while holding
 | |
| SQLITE_LOCK_EXCLUSIVE) and the deletes both the WAL and WAL-index files.
 | |
| The SQLITE_LOCK_EXCLUSIVE is not released until after both the
 | |
| WAL and WAL-index files have been deleted.
 | |
| 
 | |
| </p><p>If the application invokes
 | |
| <a href="c3ref/file_control.html">sqlite3_file_control</a>(<a href="c3ref/c_fcntl_begin_atomic_write.html#sqlitefcntlpersistwal">SQLITE_FCNTL_PERSIST_WAL</a>) on the database
 | |
| connection prior to closing, then the final checkpoint is still
 | |
| run but the WAL and WAL-index files are
 | |
| not deleted as they normally would be.
 | |
| This leaves the database in a state that allows other processes
 | |
| without write permission on the database, WAL, or WAL-index files
 | |
| to open the database read-only.
 | |
| If the WAL and WAL-index files are missing, then a process that
 | |
| lacks permission to create and initialize those files will not be
 | |
| able to open the database, unless the database is designated
 | |
| as immutable using the <a href="uri.html#uriimmutable">immutable query parameter</a>.
 | |
| 
 | |
| </p></li><li><p><b>Reconstruct the global shared WAL-index during <a href="walformat.html#recovery">recovery</a></b>
 | |
| 
 | |
| </p><p>All of the WAL-index locks, except for WAL_READ_LOCK(0),
 | |
| are held exclusively while reconstructing the global shared WAL-index
 | |
| during <a href="walformat.html#recovery">recovery</a>.  
 | |
| 
 | |
| </p></li><li><p><b>Append a new transaction to the end of the WAL</b>
 | |
| 
 | |
| </p><p>An exclusive lock is held on WAL_WRITE_LOCK while adding new
 | |
| frame onto the end of a WAL file.  
 | |
| 
 | |
| </p></li><li><p><b>Read content from the database and WAL as part of a transaction</b>
 | |
| 
 | |
| </p></li><li><p><b>Run a checkpoint</b>
 | |
| 
 | |
| </p></li><li><p><b>Reset the WAL file</b>
 | |
| 
 | |
| </p><p>A <a href="fileformat2.html#walreset">WAL reset</a> means to rewind the WAL and start adding new frames
 | |
| at the beginning.  This occurs while appending new frames to a WAL
 | |
| that has <a href="walformat.html#mxframe">mxFrame</a> equal to <a href="walformat.html#nbackfill">nBackfill</a> and which has no locks on
 | |
| WAL_READ_LOCK(1) through WAL_READ_LOCK(4).  The WAL_WRITE_LOCK is
 | |
| held.
 | |
| 
 | |
| </p></li></ul>
 | |
| 
 | |
| <a name="recovery"></a>
 | |
| 
 | |
| <h1 id="recovery"><span>3. </span>Recovery</h1>
 | |
| 
 | |
| <p>
 | |
| Recovery is the process of rebuilding the WAL-index so that it is
 | |
| synchronized with the WAL.
 | |
| 
 | |
| </p><p>
 | |
| Recovery is run by the first thread to connect to a WAL-mode database.
 | |
| Recovery restores the WAL-index so that it accurately describes the
 | |
| WAL file.  If there is no WAL file present when the first thread connects
 | |
| to the database, there is nothing to recover, but the recovery process
 | |
| still runs to initialize the WAL-index.
 | |
| 
 | |
| </p><p>
 | |
| If the WAL-index is implemented as a memory-mapped file and that file is
 | |
| read-only to the first thread to connect, then that thread creates an
 | |
| private heap-memory ersazt WAL-index and runs the recovery routine to
 | |
| populate that private WAL-index.  The same data results, but it is held
 | |
| privately rather that being written into the public shared memory area.
 | |
| 
 | |
| </p><p>
 | |
| Recovery works by doing a single pass over the WAL, from beginning to end.
 | |
| The checksums are verified on each frame of the WAL as it is read.  The
 | |
| scan stops at the end of the file or at the first invalid checksum.
 | |
| The <a href="walformat.html#mxframe">mxFrame</a> field is set to the index of the last valid commit frame
 | |
| in WAL.  Since WAL frame numbers are indexed starting with 1, mxFrame is
 | |
| also the number of valid frames in the WAL.  A "commit frame" is a frame
 | |
| that has a non-zero value in bytes 4 through 7 of the frame header.
 | |
| Since the recovery procedure has no way of knowing how many frames of the
 | |
| WAL might have previously been copied back into the database, it initializes
 | |
| the <a href="walformat.html#nbackfill">nBackfill</a> value to zero.
 | |
| 
 | |
| </p><p>
 | |
| During recovery of the global shared-memory WAL-index, exclusive locks are
 | |
| held on WAL_WRITE_LOCK, WAL_CKPT_LOCK, WAL_RECOVER_LOCK, and WAL_READ_LOCK(1) through WAL_READ_LOCK(4).  In other words, all locks associated with the
 | |
| WAL-index except for WAL_READ_LOCK(0) are held exclusively.  This prevents
 | |
| any other thread from writing the database and from reading any transactions
 | |
| that are held in the WAL, until the recovery is complete.
 | |
| </p>
 |