1981 lines
		
	
	
		
			109 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			1981 lines
		
	
	
		
			109 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html><head>
 | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
| <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | |
| <link href="sqlite.css" rel="stylesheet">
 | |
| <title>SQLite File IO Specification</title>
 | |
| <!-- path= -->
 | |
| </head>
 | |
| <body>
 | |
| <div class=nosearch>
 | |
| <a href="index.html">
 | |
| <img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | |
| </a>
 | |
| <div><!-- IE hack to prevent disappearing logo --></div>
 | |
| <div class="tagline desktoponly">
 | |
| Small. Fast. Reliable.<br>Choose any three.
 | |
| </div>
 | |
| <div class="menu mainmenu">
 | |
| <ul>
 | |
| <li><a href="index.html">Home</a>
 | |
| <li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | |
| <li class='wideonly'><a href='about.html'>About</a>
 | |
| <li class='desktoponly'><a href="docs.html">Documentation</a>
 | |
| <li class='desktoponly'><a href="download.html">Download</a>
 | |
| <li class='wideonly'><a href='copyright.html'>License</a>
 | |
| <li class='desktoponly'><a href="support.html">Support</a>
 | |
| <li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | |
| <li class='search' id='search_menubutton'>
 | |
| <a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="menu submenu" id="submenu">
 | |
| <ul>
 | |
| <li><a href='about.html'>About</a>
 | |
| <li><a href='docs.html'>Documentation</a>
 | |
| <li><a href='download.html'>Download</a>
 | |
| <li><a href='support.html'>Support</a>
 | |
| <li><a href='prosupport.html'>Purchase</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="searchmenu" id="searchmenu">
 | |
| <form method="GET" action="search">
 | |
| <select name="s" id="searchtype">
 | |
| <option value="d">Search Documentation</option>
 | |
| <option value="c">Search Changelog</option>
 | |
| </select>
 | |
| <input type="text" name="q" id="searchbox" value="">
 | |
| <input type="submit" value="Go">
 | |
| </form>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_div(nm) { /* Toggle visibility of the element whose id is nm. */
 | |
| var w = document.getElementById(nm);
 | |
| if( w.style.display=="block" ){ /* only the inline style is tested; any value other than "block" (including empty) counts as hidden */
 | |
| w.style.display = "none";
 | |
| }else{
 | |
| w.style.display = "block";
 | |
| }
 | |
| }
 | |
| function toggle_search() { /* Toggle the "searchmenu" element; when showing it, also focus the "searchbox" input. */
 | |
| var w = document.getElementById("searchmenu");
 | |
| if( w.style.display=="block" ){ /* inline style only; anything other than "block" is treated as hidden */
 | |
| w.style.display = "none";
 | |
| } else {
 | |
| w.style.display = "block";
 | |
| setTimeout(function(){ /* focus is deferred by 30 ms -- presumably so the box is displayed before it receives focus; TODO confirm */
 | |
| document.getElementById("searchbox").focus()
 | |
| }, 30);
 | |
| }
 | |
| }
 | |
| function div_off(nm){document.getElementById(nm).style.display="none";} /* Hide the element whose id is nm. */
 | |
| window.onbeforeunload = function(e){div_off("submenu");} /* Hide the "submenu" element just before the page unloads. */
 | |
| /* Disable the Search feature if we are not operating from CGI, since */
 | |
| /* Search is accomplished using CGI and will not work without it. */
 | |
| if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){ /* true when origin is missing, has no match() method, or does not contain "http" */
 | |
| document.getElementById("search_menubutton").style.display = "none"; /* hides the Search entry of the main menu */
 | |
| }
 | |
| /* Used by the Hide/Show button beside syntax diagrams, to toggle the display of the element with id obj and relabel the element with id btn. Always returns false. */
 | |
| function hideorshow(btn,obj){
 | |
| var x = document.getElementById(obj);
 | |
| var b = document.getElementById(btn);
 | |
| if( x.style.display!='none' ){ /* currently shown (inline display is anything but 'none'): hide it */
 | |
| x.style.display = 'none';
 | |
| b.innerHTML='show'; /* label names the action the next click will perform */
 | |
| }else{
 | |
| x.style.display = ''; /* clear the inline value so the element falls back to its stylesheet display */
 | |
| b.innerHTML='hide';
 | |
| }
 | |
| return false; /* presumably lets an inline handler cancel the default action -- verify against callers */
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| 
 | |
| <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | |
| <html>
 | |
| <head>
 | |
|   <link type="text/css" rel="stylesheet" href="images/fileformat/rtdocs.css">
 | |
|   <script type="text/javascript" src=images/fileformat/rtdocs.js></script>
 | |
| </head>
 | |
| <div id=document_title>SQLite File IO Specification</div>
 | |
| <div id=toc_header>Table Of Contents</div>
 | |
| <div id=toc>
 | |
|   <b>Javascript is required for some features of this document, including 
 | |
|      table of contents, figure numbering and internal references (section
 | |
|      numbers and hyper-links).
 | |
|   </b>
 | |
| </div>
 | |
| <!-- End of standard rt docs header -->
 | |
| <h1 id=overview>Overview</h1>
 | |
|   <p>
 | |
|     SQLite stores an entire database within a single file, the format of
 | |
|     which is described in the <i>SQLite Database File Format</i> 
 | |
|     document <cite>ff_sqlitert_requirements</cite>. Each database file is
 | |
|     stored within a file system, presumably provided by the host operating
 | |
|     system. Instead of interfacing with the operating system directly, 
 | |
|     the host application is required to supply an adaptor component that 
 | |
|     implements the <i>SQLite Virtual File System</i> interface 
 | |
|     (described in <cite>capi_sqlitert_requirements</cite>). The adaptor
 | |
|     component is responsible for translating the calls made by SQLite to
 | |
|     the <i>VFS</i> interface into calls to the file-system interface 
 | |
|     provided by the operating system. This arrangement is depicted in figure
 | |
|     <cite>figure_vfs_role</cite>.
 | |
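|     <center><img src="images/fileformat/vfs_role.gif" alt="Diagram of the VFS adaptor sitting between SQLite and the file-system interface of the operating system">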
 | |
|     <p><i>Figure <span class=fig id=figure_vfs_role></span> - Virtual File System (VFS) Adaptor</i>
 | |
|       </center>
 | |
|   <p>
 | |
|     Although it would be easy to design a system that uses the <i>VFS</i>
 | |
|     interface to read and update the content of a database file stored
 | |
|     within a file-system, there are several complicated issues that need
 | |
|     to be addressed by such a system:
 | |
|   <ol>
 | |
|     <li><p>SQLite is required to <b>implement atomic and durable
 | |
|         transactions</b> (the 'A' and 'D' from the ACID acronym), even if an
 | |
|         application, operating system or power failure occurs midway through or
 | |
|         shortly after updating a database file.
 | |
|         <p>To implement atomic transactions in the face of potential 
 | |
|         application, operating system or power failures, database writers write
 | |
|         a copy of those portions of the database file that they are going to
 | |
|         modify into a second file, the <i>journal file</i>, before writing
 | |
|         to the database file. If a failure does occur while modifying the 
 | |
|         database file, SQLite can reconstruct the original database 
 | |
|         (before the modifications were attempted) based on the contents of 
 | |
|         the <i>journal file</i>.
 | |
|     <li><p>SQLite is required to <b>implement isolated transactions</b> (the 'I'
 | |
|         from the ACID acronym). 
 | |
|         <p>This is done by using the file locking facilities provided by the
 | |
|         VFS adaptor to serialize writers (write transactions) and preventing
 | |
|         readers (read transactions) from accessing database files while writers
 | |
|         are midway through updating them.
 | |
|     <li><p>For performance reasons, it is advantageous to <b>minimize the 
 | |
|         quantity of data read and written</b> to and from the file-system.
 | |
|         <p>As one might expect, the amount of data read from the database 
 | |
|         file is minimized by caching portions of the database file in main 
 | |
|         memory. Additionally, multiple updates to the database file that
 | |
|         are part of the same <i>write transaction</i> may be cached in
 | |
|         main memory and written to the file together, allowing for
 | |
|         more efficient IO patterns and eliminating the redundant write 
 | |
|         operations that could take place if part of the database file is
 | |
|         modified more than once within a single <i>write transaction</i>.
 | |
|   </ol>
 | |
|   <p class=todo>
 | |
|     System requirement references for the above points.
 | |
|   <p>
 | |
|     This document describes in detail the way that SQLite uses the API 
 | |
|     provided by the VFS adaptor component to solve the problems and implement
 | |
|     the strategies enumerated above. It also specifies the assumptions made
 | |
|     about the properties of the system that the VFS adaptor provides
 | |
|     access to. For example, specific assumptions about the extent of
 | |
|     data corruption that may occur if a power failure occurs while a
 | |
|     database file is being updated are presented in section 
 | |
|     <cite>fs_characteristics</cite>.
 | |
|   <p>
 | |
|     This document does not specify the details of the interface that must
 | |
|     be implemented by the VFS adaptor component, that is left to
 | |
|     <cite>capi_sqlitert_requirements</cite>.
 | |
|   <h2>Relationship to Other Documents</h2>
 | |
|     <p>
 | |
|       Related to C-API requirements:
 | |
|     <ol>
 | |
|       <li>Opening a connection.
 | |
|       <li>Closing a connection.
 | |
|     </ol>
 | |
|     <p>
 | |
|       Related to SQL requirements:
 | |
|     <ol>
 | |
|       <li value=3>Opening a read-only transaction.
 | |
|       <li>Terminating a read-only transaction.
 | |
|       <li>Opening a read-write transaction.
 | |
|       <li>Committing a read-write transaction.
 | |
|       <li>Rolling back a read-write transaction.
 | |
|       <li>Opening a statement transaction.
 | |
|       <li>Committing a statement transaction.
 | |
|       <li>Rolling back a statement transaction.
 | |
|       <li>Committing a multi-file transaction.
 | |
|     </ol>
 | |
|     <p>
 | |
|       Related to file-format requirements:
 | |
|     <ol>
 | |
|       <li value=12>Pinning (reading) a database page.
 | |
|       <li>Unpinning a database page.
 | |
|       <li>Modifying the contents of a database page.
 | |
|       <li>Appending a new page to the database file.
 | |
|       <li>Truncating a page from the end of the database file.
 | |
|     </ol>
 | |
|   <h2>Document Structure</h2>
 | |
|     <p>
 | |
|       Section <cite>vfs_assumptions</cite> of this document describes the
 | |
|       various assumptions made about the system to which the VFS adaptor
 | |
|       component provides access. The basic capabilities and functions 
 | |
|       required from the VFS implementation are presented along with the
 | |
|       description of the VFS interface in 
 | |
|       <cite>capi_sqlitert_requirements</cite>. Section
 | |
|       <cite>vfs_assumptions</cite> complements this by describing in more
 | |
|       detail the assumptions made about VFS implementations on which the
 | |
|       algorithms presented in this document depend. Some of these assumptions
 | |
|       relate to performance issues, but most concern the expected state of
 | |
|       the file-system following a failure that occurs midway through 
 | |
|       modifying a database file.
 | |
|     <p>
 | |
|       Section <cite>database_connections</cite> introduces the concept of
 | |
|       a <i>database connection</i>, a combination of a file-handle and
 | |
|       in-memory cache used to access a database file. It also describes the
 | |
|       VFS operations required when a new <i>database connection</i> is
 | |
|       created (opened), and when one is destroyed (closed).
 | |
|     <p>
 | |
|       Section <cite>reading_data</cite> describes the steps required to
 | |
|       open a <i>read transaction</i> and read data from a database file.
 | |
|     <p>
 | |
|       Section <cite>writing_data</cite> describes the steps required to
 | |
|       open a <i>write transaction</i> and write data to a database file.
 | |
|     <p>
 | |
|       Section <cite>rollback</cite> describes the way in which aborted
 | |
|       <i>write transactions</i> may be rolled back (reverted), either as
 | |
|       a result of an explicit user directive or because an application,
 | |
|       operating system or power failure occurred while SQLite was midway
 | |
|       through updating a database file.
 | |
|     <p>
 | |
|       Section <cite>page_cache_algorithms</cite> describes some of the
 | |
|       algorithms used to determine exactly which portions of the database
 | |
|       file are cached by a <i>page cache</i>, and the effect that they
 | |
|       have on the quantity and nature of the required VFS operations.
 | |
|       It may at first seem odd to include the <i>page cache</i>, which is 
 | |
|       primarily an implementation detail, in this document. However, it is
 | |
|       necessary to acknowledge and describe the <i>page cache</i> in order to
 | |
|       provide a more complete explanation of the nature and quantity of IO
 | |
|       performed by SQLite. 
 | |
|   <h2>Glossary</h2>
 | |
|     <p class=todo>
 | |
|       After this document is ready, make the vocabulary consistent and
 | |
|       then add a glossary here.
 | |
| <h1 id=vfs_assumptions>VFS Adaptor Related Assumptions</h1>
 | |
|   <p>
 | |
|     This section documents those assumptions made about the system that
 | |
|     the VFS adaptor provides access to. The assumptions noted in section
 | |
|     <cite>fs_characteristics</cite> are particularly important. If these
 | |
|     assumptions are not true, then a power or operating system failure
 | |
|     may cause SQLite databases to become corrupted.
 | |
|   <h2 id=fs_performance>Performance Related Assumptions</h2>
 | |
|     <p>
 | |
|       SQLite uses the assumptions in this section to try to speed up 
 | |
|       reading from and writing to the database file.
 | |
| <p class=req id=A21010>
 | |
|       It is assumed that writing a series of sequential blocks of data to 
 | |
|       a file in order is faster than writing the same blocks in an arbitrary
 | |
|       order.
 | |
| </p>  <h2 id=fs_characteristics>System Failure Related Assumptions</h2>
 | |
|     <p>
 | |
|       In the event of an operating system or power failure, the various 
 | |
|       combinations of file-system software and storage hardware available
 | |
|       provide varying levels of guarantee as to the integrity of the data
 | |
|       written to the file system just before or during the failure. The exact
 | |
|       combination of IO operations that SQLite is required to perform in 
 | |
|       order to safely modify a database file depends on the exact 
 | |
|       characteristics of the target platform.
 | |
|     <p>
 | |
|       This section describes the assumptions that SQLite makes about the
 | |
|       content of a file-system following a power or system failure. In
 | |
|       other words, it describes the extent of file and file-system corruption
 | |
|       that such an event may cause.
 | |
|     <p>
 | |
|       SQLite queries an implementation for file-system characteristics
 | |
|       using the xDeviceCharacteristics() and xSectorSize() methods of the
 | |
|       database file file-handle. These two methods are only ever called
 | |
|       on file-handles open on database files. They are not called for 
 | |
|       <i>journal files</i>, <i>master-journal files</i> or 
 | |
|       <i>temporary database files</i>.
 | |
|     <p>
 | |
|       The file-system <i>sector size</i> value determined by calling the
 | |
|       xSectorSize() method is a power of 2 value between 512 and 32768, 
 | |
|       inclusive <span class=todo>reference to exactly how this is
 | |
|       determined</span>. SQLite assumes that the underlying storage
 | |
|       device stores data in blocks of <i>sector-size</i> bytes each, 
 | |
|       known as sectors. It is also assumed that each aligned block of 
 | |
|       <i>sector-size</i> bytes of each file is stored in a single device
 | |
|       sector. If the file is not an exact multiple of <i>sector-size</i>
 | |
|       bytes in size, then the final device sector is partially empty.
 | |
|     <p>
 | |
|       Normally, SQLite assumes that if a power failure occurs while 
 | |
|       updating any portion of a sector then the contents of the entire 
 | |
|       device sector are suspect following recovery. After writing to
 | |
|       any part of a sector within a file, it is assumed that the modified
 | |
|       sector contents are held in a volatile buffer somewhere within
 | |
|       the system (main memory, disk cache etc.). SQLite does not assume
 | |
|       that the updated data has reached the persistent storage media, until
 | |
|       after it has successfully <i>synced</i> the corresponding file by
 | |
|       invoking the VFS xSync() method. <i>Syncing</i> a file causes all
 | |
|       modifications to the file up until that point to be committed to
 | |
|       persistent storage.
 | |
|     <p>
 | |
|       Based on the above, SQLite is designed around a model of the
 | |
|       file-system whereby any sector of a file written to is considered to be
 | |
|       in a transient state until after the file has been successfully 
 | |
|       <i>synced</i>. Should a power or system failure occur while a sector 
 | |
|       is in a transient state, it is impossible to predict its contents
 | |
|       following recovery. It may be written correctly, not written at all,
 | |
|       overwritten with random data, or any combination thereof.
 | |
|     <p>
 | |
|       For example, if the <i>sector-size</i> of a given file-system is
 | |
|       2048 bytes, and SQLite opens a file and writes a 1024 byte block
 | |
|       of data to offset 3072 of the file, then according to the model 
 | |
|       the second sector of the file is in the transient state. If a 
 | |
|       power failure or operating system crash occurs before or during
 | |
|       the next call to xSync() on the file handle, then following system
 | |
|       recovery SQLite assumes that all file data between byte offsets 2048 
 | |
|       and 4095, inclusive, is invalid. It also assumes that the first
 | |
|       sector of the file, containing the data from byte offset 0 to 2047 
 | |
|       inclusive, is valid, since it was not in a transient state when the 
 | |
|       crash occurred.
 | |
|     <p>
 | |
|       Assuming that any and all sectors in the transient state may be 
 | |
|       corrupted following a power or system failure is a very pessimistic
 | |
|       approach. Some modern systems provide more sophisticated guarantees
 | |
|       than this. SQLite allows the VFS implementation to specify at runtime
 | |
|       that the current platform supports zero or more of the following 
 | |
|       properties:
 | |
|     <ul>
 | |
|       <li><p>The <b>safe-append</b> property. If a system supports the
 | |
|           <i>safe-append</i> property, it means that when a file is extended
 | |
|           the new data is written to the persistent media before the size
 | |
|           of the file itself is updated. This guarantees that if a failure
 | |
|           occurs after a file has been extended, following recovery 
 | |
|           the write operations that extended the file will appear to have 
 | |
|           succeeded or not occurred at all. It is not possible for invalid
 | |
|           or garbage data to appear in the extended region of the file.
 | |
|       <li><p>The <b>atomic-write</b> property. A system that supports this
 | |
|           property also specifies the size or sizes of the blocks that it
 | |
|           is capable of writing. Valid sizes are powers of two greater than
 | |
|           or equal to 512. If a write operation modifies a block of <i>n</i> bytes,
 | |
|           where <i>n</i> is one of the block sizes for which <i>atomic-write</i>
 | |
|           is supported, then it is impossible for an aligned write of <i>n</i>
 | |
|           bytes to cause data corruption. If a failure occurs after such 
 | |
|           a write operation and before the applicable file handle is
 | |
|           <i>synced</i>, then following recovery it will appear as if the
 | |
|           write operation succeeded or did not take place at all. It is not
 | |
|           possible that only part of the data specified by the write operation
 | |
|           was written to persistent media, nor is it possible for any content
 | |
|           of the sectors spanned by the write operation to be replaced with
 | |
|           garbage data, as it is normally assumed to be.
 | |
|       <li><p>The <b>sequential-write</b> property. A system that supports the
 | |
|           <i>sequential-write</i> property guarantees that the various write
 | |
|           operations on files within the same file-system are written to the
 | |
|           persistent media in the same order that they are performed by the
 | |
|           application and that each operation is concluded before the next
 | |
|           is begun. If a system supports the <i>sequential-write</i> 
 | |
|           property, then the model used to determine the possible states of
 | |
|           the file-system following a failure is different. 
 | |
|           <p>If a system supports <i>sequential-write</i> it is assumed that 
 | |
|           <i>syncing</i> any file within the file system flushes all write
 | |
|           operations on all files (not just the <i>synced</i> file) to
 | |
|           the persistent media. If a failure does occur, it is not known
 | |
|           whether or not any of the write operations performed by SQLite 
 | |
|           since the last time a file was <i>synced</i> were written to the persistent media. SQLite is able to
 | |
|           assume that if the write operations of unknown status are arranged
 | |
|           in the order that they occurred:
 | |
|           <ol> 
 | |
|             <li> the first <i>n</i> operations will have been executed 
 | |
|                  successfully,
 | |
|             <li> the next operation puts all device sectors that it modifies
 | |
|                  into the transient state, so that following recovery each
 | |
|                  sector may be partially written, completely written, not
 | |
|                  written at all or populated with garbage data,
 | |
|             <li> the remaining operations will not have had any effect on
 | |
|                  the contents of the file-system.
 | |
|           </ol> 
 | |
|     </ul>
 | |
|     <h3 id=fs_assumption_details>Failure Related Assumption Details</h3>
 | |
|     <p>
 | |
|       This section describes how the assumptions presented in the parent
 | |
|       section apply to the individual API functions and operations provided 
 | |
|       by the VFS to SQLite for the purposes of modifying the contents of the
 | |
|       file-system.
 | |
|     <p>
 | |
|       SQLite manipulates the contents of the file-system using a combination
 | |
|       of the following four types of operation:
 | |
|     <ul>
 | |
|       <li> <b>Create file</b> operations. SQLite may create new files
 | |
|            within the file-system by invoking the xOpen() method of
 | |
|            the sqlite3_vfs object.
 | |
|       <li> <b>Delete file</b> operations. SQLite may remove files from the
 | |
|            file system by calling the xDelete() method of the
 | |
|            sqlite3_vfs object.
 | |
|       <li> <b>Truncate file</b> operations. SQLite may truncate existing 
 | |
|            files by invoking the xTruncate() method of the sqlite3_file 
 | |
|            object.
 | |
|       <li> <b>Write file</b> operations. SQLite may modify the contents
 | |
|            and increase the size of a file by invoking the xWrite() 
 | |
|            method of the sqlite3_file object.
 | |
|     </ul>
 | |
|     <p>
 | |
|       Additionally, all VFS implementations are required to provide the
 | |
|       <i>sync file</i> operation, accessed via the xSync() method of the
 | |
|       sqlite3_file object, used to flush create, write and truncate operations
 | |
|       on a file to the persistent storage medium.
 | |
|     <p>
 | |
|       The formalized assumptions in this section refer to <i>system failure</i>
 | |
|       events.  In this context, this should be interpreted as any failure that
 | |
|       causes the system to stop operating, for example a power failure or
 | |
|       operating system crash.
 | |
|     <p>
 | |
|       SQLite does not assume that a <b>create file</b> operation has actually
 | |
|       modified the file-system records within persistent storage until
 | |
|       after the file has been successfully <i>synced</i>.
 | |
| <p class=req id=A21001>
 | |
|       If a system failure occurs during or after a "create file"
 | |
|       operation, but before the created file has been <i>synced</i>, then 
 | |
|       SQLite assumes that it is possible that the created file may not
 | |
|       exist following system recovery.
 | |
| </p>    <p>
 | |
|       Of course, it is also possible that it does exist following system
 | |
|       recovery.
 | |
| <p class=req id=A21002>
 | |
|       If a "create file" operation is executed by SQLite, and then the
 | |
|       created file <i>synced</i>, then SQLite assumes that the file-system
 | |
|       modifications corresponding to the "create file" operation have been
 | |
|       committed to persistent media. It is assumed that if a system
 | |
|       failure occurs any time after the file has been successfully 
 | |
|       <i>synced</i>, then the file is guaranteed to appear in the file-system
 | |
|       following system recovery.
 | |
| </p>    <p>
 | |
|       A <b>delete file</b> operation (invoked by a call to the VFS xDelete() 
 | |
|       method) is assumed to be an atomic and durable operation. 
 | |
|     </p>
 | |
| <p class=req id=A21003>
 | |
|       If a system failure occurs at any time after a "delete file" 
 | |
|       operation (call to the VFS xDelete() method) returns successfully, it is
 | |
|       assumed that the file-system will not contain the deleted file following
 | |
|       system recovery.
 | |
| </p><p class=req id=A21004>
 | |
|       If a system failure occurs during a "delete file" operation,
 | |
|       it is assumed that following system recovery the file-system will 
 | |
|       either contain the file being deleted in the state it was in before
 | |
|       the operation was attempted, or not contain the file at all. It is 
 | |
|       assumed that it is not possible for the file to have become corrupted
 | |
|       purely as a result of a failure occurring during a "delete file" 
 | |
|       operation.
 | |
| </p>    <p>
 | |
|       The effects of a <b>truncate file</b> operation are not assumed to
 | |
|       be made persistent until after the corresponding file has been
 | |
|       <i>synced</i>.
 | |
| <p class=req id=A21005>
 | |
|       If a system failure occurs during or after a "truncate file"
 | |
|       operation, but before the truncated file has been <i>synced</i>, then 
 | |
|       SQLite assumes that the size of the truncated file is either as large
 | |
|       or larger than the size that it was to be truncated to.
 | |
| </p><p class=req id=A21006>
 | |
|       If a system failure occurs during or after a "truncate file"
 | |
|       operation, but before the truncated file has been <i>synced</i>, then 
 | |
|       it is assumed that the contents of the file up to the size that the
 | |
|       file was to be truncated to are not corrupted.
 | |
| </p>    <p>
 | |
|       The above two assumptions may be interpreted to mean that if a 
 | |
|       system failure occurs after file truncation but before the truncated
 | |
|       file is <i>synced</i>, the contents of the file following the point
 | |
|       at which it was to be truncated may not be trusted. They may contain 
 | |
|       the original file data, or may contain garbage.
 | |
| <p class=req id=A21007>
 | |
|       If a "truncate file" operation is executed by SQLite, and then the
 | |
|       truncated file <i>synced</i>, then SQLite assumes that the file-system
 | |
|       modifications corresponding to the "truncate file" operation have been
 | |
|       committed to persistent media. It is assumed that if a system
 | |
|       failure occurs any time after the file has been successfully 
 | |
|       <i>synced</i>, then the effects of the file truncation are guaranteed
 | |
|       to appear in the file system following recovery.
 | |
| </p>    <p>
 | |
|       A <b>write file</b> operation modifies the contents of an existing file
 | |
|       within the file-system. It may also increase the size of the file.
 | |
|       The effects of a <i>write file</i> operation are not assumed to
 | |
|       be made persistent until after the corresponding file has been
 | |
|       <i>synced</i>.
 | |
| <p class=req id=A21008>
 | |
|       If a system failure occurs during or after a "write file"
 | |
|       operation, but before the corresponding file has been <i>synced</i>, 
 | |
|       then it is assumed that the content of all sectors spanned by the
 | |
|       <i>write file</i> operation are untrustworthy following system 
 | |
|       recovery. This includes regions of the sectors that were not
 | |
|       actually modified by the write file operation.
 | |
| </p><p class=req id=A21011>
 | |
|       If a system failure occurs on a system that supports the 
 | |
|       <i>atomic-write</i> property for blocks of size <i>N</i> bytes
 | |
|       following an aligned write of <i>N</i> 
 | |
|       bytes to a file but before the file has been successfully <i>synced</i>,
 | |
|       then it is assumed following recovery that all sectors spanned by the
 | |
|       write operation were correctly updated, or that none of the sectors were
 | |
|       modified at all.
 | |
| </p><p class=req id=A21012>
 | |
|       If a system failure occurs on a system that supports the 
 | |
|       <i>safe-append</i> property following a write operation that appends data
 | |
|       to the end of the file without modifying any of the existing file 
 | |
|       content but before the file has been successfully <i>synced</i>,
 | |
|       then it is assumed following recovery that either the data was
 | |
|       correctly appended to the file, or that the file size remains 
 | |
|       unchanged. It is assumed that it is impossible that the file be
 | |
|       extended but populated with incorrect data.
 | |
| </p><p class=req id=A21013>
 | |
|       Following a system recovery, if a device sector is deemed to be
 | |
|       untrustworthy as defined by A21008 and neither A21011 nor A21012 
 | |
|       applies to the range of bytes written, then no assumption can be
 | |
|       made about the content of the sector following recovery. It is
 | |
|       assumed that it is possible for such a sector to be written 
 | |
|       correctly, not written at all, populated with garbage data or any
 | |
|       combination thereof.
 | |
| </p><p class=req id=A21009>
 | |
|       If a system failure occurs during or after a "write file"
 | |
|       operation that causes the file to grow, but before the corresponding 
 | |
|       file has been <i>synced</i>, then it is assumed that the size of 
 | |
|       the file following recovery is as large or larger than it was when
 | |
|       it was most recently <i>synced</i>.
 | |
| </p>    <p>
 | |
|       If a system supports the <i>sequential-write</i> property, then further
 | |
|       assumptions may be made with respect to the state of the file-system
 | |
|       following recovery from a <i>system failure</i>. Specifically, it is
 | |
|       assumed that create, truncate, delete and write file operations are
 | |
|       applied to the persistent representation in the same order as they 
 | |
|       are performed by SQLite. Furthermore, it is assumed that the 
 | |
|       file-system waits until one operation is safely written to the 
 | |
|       persistent media before the next is attempted, just as if the relevant
 | |
|       file were <i>synced</i> following each operation.
 | |
| <p class=req id=A21014>
 | |
|       If a system failure occurs on a system that supports the
 | |
|       <i>sequential-write</i> property, then it is assumed that all 
 | |
|       operations completed before the last time any file was <i>synced</i> 
 | |
|       have been successfully committed to persistent media.
 | |
| </p><p class=req id=A21015>
 | |
|       If a system failure occurs on a system that supports the
 | |
|       <i>sequential-write</i> property, then it is assumed that the set
 | |
|       of possible states that the file-system may be in following recovery
 | |
|       is the same as if each of the write operations performed since the most
 | |
|       recent time a file was <i>synced</i> was itself followed by a <i>sync
 | |
|       file</i> operation, and that the system failure may have occurred during
 | |
|       any of the write or <i>sync file</i> operations.
 | |
| </p><!--
 | |
|     <p>
 | |
|       The return value of the xSectorSize() method, the <i>sector-size</i>, is
 | |
|       expected by SQLite to be a power of 2 value greater than or equal to 512.
 | |
|     <p class=todo> 
 | |
|       What does it do if this is not the case? If the sector size is less
 | |
|       than 512 then 512 is used instead. How about a non power-of-two value?
 | |
|       UPDATE: How this situation is handled should be described in the API
 | |
|       requirements. Here we can just refer to the other document.
 | |
|     <p>
 | |
|       SQLite assumes that files are stored and written to within the
 | |
|       file-system as a collection of blocks (hereafter sectors) of data, each
 | |
|       <i>sector-size</i> bytes in size. This model is used to derive
 | |
|       the following assumptions related to the expected state of the
 | |
|       file-system following a power failure or operating system crash.
 | |
|     <ul>
 | |
|       <li>
 | |
|           After part or all of a file sector has been modified
 | |
|           using the xWrite() method of an open file-handle, the sector
 | |
|           is said to be in a transient state, where the operating system
 | |
|           makes no guarantees about the actual content of the sector on the
 | |
|           persistent media. The sector remains in the transient state until
 | |
|           the next successful call to xSync() on the same file-handle 
 | |
|           returns. If a power failure or operating system crash occurs, then
 | |
|           part or all of all sectors in the transient state when the crash
 | |
|           occurred may contain invalid data following system recovery.
 | |
|       <li>
 | |
|           Following a power failure or operating system crash, the content
 | |
|           of all sectors that were not in a transient state when the crash
 | |
|           occurred may be trusted.
 | |
|     </ul>
 | |
|     <p class=todo>
 | |
|       What do we assume about the other three file-system write 
 | |
|       operations - xTruncate(), xDelete() and "create file"?
 | |
|     <p>
 | |
|       The xDeviceCharacteristics() method returns a set of flags, 
 | |
|       indicating which of the following properties (if any) the 
 | |
|       file-system provides:
 | |
|     <ul>
 | |
|       <li>The <b><i>sequential IO</i></b> property. If a file-system has this 
 | |
|           property, then in the event of a crash at most a single sector
 | |
|           may contain invalid data. The file-system guarantees
 | |
|       <li>The <b><i>safe-append</i></b> property.
 | |
|       <li>The <b><i>atomic write</i></b> property.
 | |
|     </ul>
 | |
|     <p class=todo>
 | |
|       Write an explanation as to how the file-system properties influence
 | |
|       the model used to predict file damage after a catastrophe.
 | |
|  -->
 | |
| <h1 id=database_connections>Database Connections</h1>
 | |
|   <p>
 | |
|     Within this document, the term <i>database connection</i> has a slightly
 | |
|     different meaning from that which one might assume. The handles returned
 | |
|     by the <code>sqlite3_open()</code> and <code>sqlite3_open16()</code>
 | |
|     APIs (<span class=todo>reference</span>) are referred to as <i>database
 | |
|     handles</i>.  A <i>database connection</i> is a connection to a single
 | |
|     database file using a single file-handle, which is held open for the
 | |
|     lifetime of the connection. Using the SQL ATTACH syntax, multiple
 | |
|     <i>database connections</i> may be accessed via a single <i>database
 | |
|     handle</i>. Or, using SQLite's <i>shared-cache mode</i> feature, multiple
 | |
|     <i>database handles</i> may access a single <i>database connection</i>.
 | |
|   <p>
 | |
|     Usually, a new <i>database connection</i> is opened whenever the user opens
 | |
|     a new <i>database handle</i> on a real database file (not an in-memory
 | |
|     database) or when a database file is attached to an existing <i>database
 | |
|     connection</i> using the SQL ATTACH syntax. However if the <i>shared-cache
 | |
|     mode</i> feature is enabled, then the database file may be accessed through
 | |
|     an existing <i>database connection</i>. For more information on
 | |
|     <i>shared-cache mode</i>, refer to <span class=todo>Reference</span>.  The
 | |
|     various IO operations required to open a new connection are detailed in
 | |
|     section <cite>open_new_connection</cite> of this document.
 | |
|   <p>
 | |
|     Similarly, a <i>database connection</i> is usually closed when the user
 | |
|     closes a <i>database handle</i> that is open on a real database file or
 | |
|     has had one or more real database files attached to it using the ATTACH
 | |
|     mechanism, or when a real database file is detached from a <i>database
 | |
|     connection</i> using the DETACH syntax. Again, the exception is if
 | |
|     <i>shared-cache mode</i> is enabled. In this case, a <i>database
 | |
|     connection</i> is not closed until its number of users reaches zero.
 | |
|     The IO related steps required to close a <i>database connection</i> are
 | |
|     described in section <cite>closing_database_connection</cite>.
 | |
|   <p class=todo>
 | |
|     After sections 4 and 5 are finished, come back here and see if we can add a
 | |
|     list of state items associated with each database connection to make things
 | |
|     easier to understand. i.e each database connection has a file handle, a set
 | |
|     of entries in the page cache, an expected page size etc.
 | |
|   <h2 id=open_new_connection>Opening a New Connection</h2>
 | |
|     <p>
 | |
|       This section describes the VFS operations that take place when a
 | |
|       new database connection is created. 
 | |
|     <p>
 | |
|       Opening a new database connection is a two-step process:
 | |
|     <ol>
 | |
|       <li> A file-handle is opened on the database file.
 | |
|       <li> If step 1 was successful, an attempt is made to read the 
 | |
|            <i>database file header</i> from the database file using the 
 | |
|            new file-handle.
 | |
|     </ol>
 | |
|     <p>
 | |
|       In step 2 of the procedure above, the database file is not locked
 | |
|       before it is read from. This is the only exception to the locking 
 | |
|       rules described in section <cite>reading_data</cite>.
 | |
|     <p>
 | |
|       The reason for attempting to read the <i>database file header</i>
 | |
|       is to determine the <i>page-size</i> used by the database file. 
 | |
|       Because it is not possible to be certain as to the <i>page-size</i> 
 | |
|       without holding at least a <i>shared lock</i> on the database file
 | |
|       (because some other <i>database connection</i> might have changed it
 | |
|       since the <i>database file header</i> was read), the value read from the
 | |
|       <i>database file header</i> is known as the <i>expected page size</i>. 
 | |
| <p class=req id=H35060>
 | |
| When a new <i>database connection</i> is required, SQLite shall attempt
 | |
| to open a file-handle on the database file. If the attempt fails, then
 | |
| no new <i>database connection</i> is created and an error returned.
 | |
| <p class=req id=H35070>
 | |
| When a new <i>database connection</i> is required, after opening the
 | |
| new file-handle, SQLite shall attempt to read the first 100 bytes
 | |
| of the database file. If the attempt fails for any other reason than
 | |
| that the opened file is less than 100 bytes in size, then
 | |
| the file-handle is closed, no new <i>database connection</i> is created
 | |
| and an error returned instead.
 | |
| <p class=req id=H35080>
 | |
| If the <i>database file header</i> is successfully read from a newly
 | |
| opened database file, the connection's <i>expected page-size</i> shall
 | |
| be set to the value stored in the <i>page-size field</i> of the
 | |
| database header.
 | |
| <p class=req id=H35090>
 | |
| If the <i>database file header</i> cannot be read from a newly opened
 | |
| database file (because the file is less than 100 bytes in size), the
 | |
| connection's <i>expected page-size</i> shall be set to the compile time
 | |
| value of the SQLITE_DEFAULT_PAGESIZE option.
 | |
|   <h2 id=closing_database_connection>Closing a Connection</h2>
 | |
|     <p>
 | |
|       This section describes the VFS operations that take place when an
 | |
|       existing database connection is closed (destroyed). 
 | |
|     <p>
 | |
|       Closing a database connection is a simple matter. The open VFS 
 | |
|       file-handle is closed and in-memory <i>page cache</i> related resources
 | |
|       are released. 
 | |
| <p class=req id=H35400>
 | |
| When a <i>database connection</i> is closed, SQLite shall close the
 | |
| associated file handle at the VFS level.
 | |
| <p class=req id=H35430>
 | |
| When a <i>database connection</i> is closed, all associated <i>page
 | |
| cache</i> entries shall be discarded.
 | |
| <h1 id=page_cache>The Page Cache</h1>
 | |
|   <p>
 | |
|     The contents of an SQLite database file are formatted as a set of 
 | |
|     fixed size pages. See <cite>ff_sqlitert_requirements</cite> for a
 | |
|     complete description of the format used. The <i>page size</i> used
 | |
|     for a particular database is stored as part of the database file
 | |
|     header at a well-known offset within the first 100 bytes of the 
 | |
|     file. Almost all read and write operations performed by SQLite on
 | |
|     database files are done on blocks of data <i>page-size</i> bytes
 | |
|     in size. 
 | |
|   <p>
 | |
|     All SQLite database connections running within a single process share
 | |
|     a single <i>page cache</i>. The <i>page cache</i> caches data read from
 | |
|     database files in main-memory on a per-page basis. When SQLite requires
 | |
|     data from a database file to satisfy a database query, it checks the <i>
 | |
|     page cache</i> for usable cached versions of the required database
 | |
|     pages before loading them from the database file. If no usable cache
 | |
|     entry can be found and the database page data is loaded from the database
 | |
|     file, it is cached in the <i>page cache</i> in case the same data is 
 | |
|     needed again later. Because reading from the database file is assumed to
 | |
|     be an order of magnitude slower than reading from main-memory, caching
 | |
|     database page content in the <i>page cache</i> to minimize the number
 | |
|     of read operations performed on the database file is a significant
 | |
|     performance enhancement.
 | |
|   <p>
 | |
|     The <i>page cache</i> is also used to buffer database write operations.
 | |
|     When SQLite is required to modify one or more of the <i>database pages</i>
 | |
|     that make up a database file, it first modifies the cached version of
 | |
|     the page in the <i>page cache</i>. At that point the page is considered
 | |
|     a "dirty" page. At some point later on, the new content of the "dirty"
 | |
|     page is copied from the <i>page cache</i> into the database file via
 | |
|     the VFS interface. Buffering writes in the <i>page cache</i> can reduce
 | |
|     the number of write operations required on the database file (in cases
 | |
|     where the same page is updated twice) and allows optimizations based
 | |
|     on the assumptions outlined in section <cite>fs_performance</cite>.
 | |
|   <p>
 | |
|     Database read and write operations, and the way in which they interact
 | |
|     with and use the <i>page cache</i>, are described in detail in sections
 | |
|     <cite>reading_data</cite> and <cite>writing_data</cite> of this document,
 | |
|     respectively.
 | |
|   <p>
 | |
|     At any one time, the <i>page cache</i> contains zero or more <i>page cache
 | |
|     entries</i>, each of which has the following data associated with it:
 | |
|   <ul>
 | |
|     <li><p>
 | |
|       A reference to <b>the associated <i>database connection</i></b>. Each
 | |
|       entry in the <i>page cache</i> is associated with a single <i>database
 | |
|       connection</i>; the <i>database connection</i> that created the entry. 
 | |
|       A <i>page cache entry</i> is only ever used by the <i>database 
 | |
|       connection</i> that created it. Page cache entries are not shared between
 | |
|       <i>database connections</i>.
 | |
|     <li><p>
 | |
|       The <b><i>page number</i></b> of the cached page. Pages are sequentially
 | |
|       numbered within a database file starting from page 1 (page 1 begins at
 | |
|       byte offset 0). Refer to <cite>ff_sqlitert_requirements</cite> for
 | |
|       details.
 | |
|     <li><p>
 | |
|       The <b>cached data</b>; a blob of data <i>page-size</i> bytes in size.
 | |
|   </ul>
 | |
|   <p>
 | |
|     The first two elements in the list above, the associated <i>database
 | |
|     connection</i> and the <i>page number</i>, uniquely identify the
 | |
|     <i>page cache entry</i>. At no time may the <i>page cache</i> contain two
 | |
|     entries for which both the <i>database connection</i> and <i>page 
 | |
|     number</i> are identical. Or, put another way, a single <i>database
 | |
|     connection</i> never caches more than one copy of a database page
 | |
|     within the <i>page cache</i>.
 | |
|   <p>
 | |
|     At any one time, each <i>page cache entry</i> may be said to be a <i>clean
 | |
|     page</i>, a <i>non-writable dirty page</i> or a <i>writable dirty page</i>,
 | |
|     according to the following definitions:
 | |
|   <ul>
 | |
|     <li> <p>A <b><i>clean page</i></b> is one for which the cached data 
 | |
|          currently matches the contents of the corresponding page of 
 | |
|          the database file. The page has not been modified since it was
 | |
|          loaded from the file.
 | |
|     <li> <p>A <b><i>dirty page</i></b> is a <i>page cache entry</i> for which
 | |
|          the cached data has been modified since it was loaded from the database
 | |
|          file, and so no longer matches the current contents of the
 | |
|          corresponding database file page. A <i>dirty page</i> is one that is
 | |
|          currently buffering a modification made to the database file as part
 | |
|          of a <i>write transaction</i>. 
 | |
|     <li> <p>Within this document, the term <b><i>non-writable dirty
 | |
|          page</i></b> is used specifically to refer to a <i>page cache
 | |
|          entry</i> with modified content for which it is not yet safe to update
 | |
|          the database file with.  It is not safe to update a database file with
 | |
|          a buffered write if a power or system failure that occurs during or
 | |
|          soon after the update may cause the database to become corrupt
 | |
|          following system recovery, according to the assumptions made in
 | |
|          section <cite>fs_assumption_details</cite>.
 | |
|     <li> <p>A <i>dirty page</i> for which it would be safe to update the
 | |
|          corresponding database file page with the modified contents of 
 | |
|          without risking database corruption is known as a 
 | |
|          <b><i>writable dirty page</i></b>.
 | |
|   </ul>
 | |
|   <p>
 | |
|     The exact logic used to determine if a <i>page cache entry</i> with
 | |
|     modified content is a <i>non-writable dirty page</i> or a <i>writable dirty page</i> is
 | |
|     presented in section <cite>page_cache_algorithms</cite>.
 | |
|   <p>
 | |
|     Because main-memory is a limited resource, the <i>page cache</i> cannot
 | |
|     be allowed to grow indefinitely. As a result, unless all database files
 | |
|     opened by database connections within the process are quite small,
 | |
|     sometimes data must be discarded from the <i>page cache</i>. In practice
 | |
|     this means <i>page cache entries</i> must be purged to make room
 | |
|     for new ones. If a <i>page cache entry</i> being removed from the <i>page
 | |
|     cache</i> to free main-memory is a <i>dirty page</i>, then its contents
 | |
|     must be saved into the database file before it can be discarded without
 | |
|     data loss. The following two sub-sections describe the algorithms used by
 | |
|     the <i>page cache</i> to determine exactly when existing <i>page cache
 | |
|     entries</i> are purged (discarded).
 | |
|   <h2>Page Cache Configuration</h2>
 | |
|     <p class=todo>
 | |
|       Describe the parameters set to configure the page cache limits.
 | |
|   <h2 id=page_cache_algorithms>Page Cache Algorithms</h2>
 | |
|     <p class=todo>
 | |
|       Requirements describing the way in which the configuration parameters
 | |
|       are used. About LRU etc.
 | |
| <h1 id=reading_data>Reading Data</h1>
 | |
|   <p>
 | |
|     In order to return data from the database to the user, for example as
 | |
|     the results of a SELECT query, SQLite must at some point read data
 | |
|     from the database file. Usually, data is read from the database file in 
 | |
|     aligned blocks of <i>page-size</i> bytes. The exception is when the
 | |
|     database file header fields are being inspected, before the
 | |
|     <i>page-size</i> used by the database can be known.
 | |
|   <p>
 | |
|     With two exceptions, a <i>database connection</i> must have an open 
 | |
|     transaction (either a <i>read-only transaction</i> or a 
 | |
|     <i>read/write transaction</i>) on the database before data may be 
 | |
|     read from the database file. 
 | |
|   <p>
 | |
|     The two exceptions are:
 | |
|   <ul>
 | |
|     <li> When an attempt is made to read the 100 byte <i>database file
 | |
|          header</i> immediately after opening the <i>database connection</i>
 | |
|          (see section <cite>open_new_connection</cite>). When this occurs
 | |
|          no lock is held on the database file.
 | |
|     <li> Data read while in the process of opening a read-only transaction
 | |
|          (see section <cite>open_read_only_trans</cite>). These read 
 | |
|          operations occur after a <i>shared lock</i> is held on the database
 | |
|          file.
 | |
|   </ul>
 | |
|   <p>
 | |
|     Once a transaction has been opened, reading data from a database 
 | |
|     connection is a simple operation. Using the xRead() method of the 
 | |
|     file-handle open on the database file, the required database file 
 | |
|     pages are read one at a time. SQLite never reads partial pages and
 | |
|     always uses a single call to xRead() for each required page. 
 | |
|    <p>
 | |
|     After reading the data for a database page, SQLite stores the raw
 | |
|     page of data in the <i>page cache</i>. Each time a page of data is 
 | |
|     required by the upper layers, the <i>page cache</i> is queried
 | |
|     to see if it contains a copy of the required page stored by
 | |
|     the current <i>database connection</i>. If such an entry can be
 | |
|     found, then the required data is read from the <i>page cache</i> instead
 | |
|     of the database file. Only a connection with an open transaction
 | |
|     (either a <i>read-only transaction</i> or a 
 | |
|     <i>read/write transaction</i>) on the database may read data from the
 | |
|     <i>page cache</i>. In this sense reading from the <i>page cache</i> is no
 | |
|     different to reading from the <i>database file</i>.
 | |
|    <p>
 | |
|     Refer to section <cite>page_cache_algorithms</cite> for a description 
 | |
|     of exactly how and for how long page data is stored in the 
 | |
|     <i>page cache</i>.
 | |
| <p class=req id=H35010>
 | |
| Except for the read operation required by H35070 and those reads made
 | |
| as part of opening a read-only transaction, SQLite shall ensure that
 | |
| a <i>database connection</i> has an open read-only or read/write
 | |
| transaction when any data is read from the <i>database file</i>.
 | |
| <p class=req id=H35020>
 | |
| Aside from those read operations described by H35070 and H21XXX, SQLite
 | |
| shall read data from the database file in aligned blocks of
 | |
| <i>page-size</i> bytes, where <i>page-size</i> is the database page size
 | |
| used by the database file.
 | |
| <p class=req id=H35420>
 | |
| SQLite shall ensure that a <i>database connection</i> has an open
 | |
| read-only or read/write transaction before using data stored in the <i>page
 | |
| cache</i> to satisfy user queries.
 | |
|   <h2 id=open_read_only_trans>Opening a Read-Only Transaction</h2>
 | |
|     <p>
 | |
|       Before data may be read from a <i>database file</i> or queried from
 | |
|       the <i>page cache</i>, a <i>read-only transaction</i> must be
 | |
|       successfully opened by the associated database connection (this is true
 | |
|       even if the connection will eventually write to the database, as a
 | |
|       <i>read/write transaction</i> may only be opened by upgrading from a
 | |
|       <i>read-only transaction</i>). This section describes the procedure
 | |
|       for opening a <i>read-only transaction</i>.
 | |
|     <p>
 | |
|       The key element of a <i>read-only transaction</i> is that the 
 | |
|       file-handle open on the database file obtains and holds a
 | |
|       <i>shared-lock</i> on the database file. Because a connection requires
 | |
|       an <i>exclusive-lock</i> before it may actually modify the contents
 | |
|       of the database file, and by definition while one connection is holding
 | |
|       a <i>shared-lock</i> no other connection may hold an 
 | |
|       <i>exclusive-lock</i>, holding a <i>shared-lock</i> guarantees that
 | |
|       no other process may modify the database file while the <i>read-only
 | |
|       transaction</i> remains open. This ensures that <i>read-only
 | |
|       transactions</i> are sufficiently isolated from the transactions of
 | |
|       other database users (see section <cite>overview</cite>).
 | |
|     <p>Obtaining the <i>shared lock</i> itself on the database file is quite
 | |
|        simple, SQLite just calls the xLock() method of the database file 
 | |
|        handle. Some of the other processes that take place as part of 
 | |
|        opening the <i>read-only transaction</i> are quite complex. The 
 | |
|        steps that SQLite is required to take to open a <i>read-only
 | |
|        transaction</i>, in the order in which they must occur, are as follows:
 | |
|     <ol>
 | |
|       <li>A <i>shared-lock</i> is obtained on the database file.
 | |
|       <li>The connection checks if a <i>hot journal file</i> exists in the
 | |
|           file-system. If one does, then it is rolled back before continuing.
 | |
|       <li>The connection checks if the data in the <i>page cache</i> may 
 | |
|           still be trusted. If not, all page cache data is discarded.
 | |
|       <li>If the file-size is not zero bytes and the page cache does not
 | |
|           contain valid data for the first page of the database, then the
 | |
|           data for the first page must be read from the database.
 | |
|     </ol>
 | |
|     <p>
 | |
|       Of course, an error may occur while attempting any of the 4 steps
 | |
|       enumerated above. If this happens, then the <i>shared-lock</i> is 
 | |
|       released (if it was obtained) and an error returned to the user. 
 | |
|       Step 2 of the procedure above is described in more detail in section
 | |
|       <cite>hot_journal_detection</cite>. Section <cite>cache_validation</cite>
 | |
|       describes the process identified by step 3 above. Further detail
 | |
|       on step 4 may be found in section <cite>read_page_one</cite>.
 | |
| <p class=req id=H35100>
 | |
| When required to open a <i>read-only transaction</i> using a
 | |
| <i>database connection</i>, SQLite shall first attempt to obtain
 | |
| a <i>shared-lock</i> on the file-handle open on the database file.
 | |
| <p class=req id=H35110>
 | |
| If, while opening a <i>read-only transaction</i>, SQLite fails to obtain
 | |
| the <i>shared-lock</i> on the database file, then the process is
 | |
| abandoned, no transaction is opened and an error returned to the user.
 | |
|     <p>
 | |
|       The most common reason an attempt to obtain a <i>shared-lock</i> may
 | |
|       fail is that some other connection is holding an <i>exclusive</i> or
 | |
|       <i>pending lock</i>. However it may also fail because some other
 | |
|       error (e.g. an IO or comms related error) occurs within the call to the
 | |
|       xLock() method.
 | |
| <p class=req id=H35030>
 | |
| While opening a <i>read-only transaction</i>, after successfully
 | |
| obtaining a <i>shared lock</i> on the database file, SQLite shall
 | |
| attempt to detect and roll back a <i>hot journal file</i> associated
 | |
| with the same database file.
 | |
| <p class=req id=H35120>
 | |
| If, while opening a <i>read-only transaction</i>, SQLite encounters
 | |
| an error while attempting to detect or roll back a <i>hot journal
 | |
| file</i>, then the <i>shared-lock</i> on the database file is released,
 | |
| no transaction is opened and an error returned to the user.
 | |
|     <p>
 | |
|       Section <cite>hot_journal_detection</cite> contains a description of
 | |
|       and requirements governing the detection of a hot-journal file referred
 | |
|       to in the above requirements.
 | |
| <p class=req id=H35040>
 | |
| Assuming no errors have occurred, then after attempting to detect and
 | |
| roll back a <i>hot journal file</i>, if the <i>page cache</i> contains
 | |
| any entries associated with the current <i>database connection</i>,
 | |
| then SQLite shall validate the contents of the <i>page cache</i> by
 | |
| testing the <i>file change counter</i>.  This procedure is known as
 | |
| <i>cache validation</i>.
 | |
|     <p>
 | |
|       The <i>cache validation</i> process is described in detail in section
 | |
|       <cite>cache_validation</cite>.
 | |
| <p class=req id=H35050>
 | |
| If the cache validation procedure prescribed by H35040 is required and
 | |
| does not prove that the <i>page cache</i> entries associated with the
 | |
| current <i>database connection</i> are valid, then SQLite shall discard
 | |
| all entries associated with the current <i>database connection</i> from
 | |
| the <i>page cache</i>.
 | |
|     <p>
 | |
|       The numbered list above notes that the data for the first page of the
 | |
|       database file, if it exists and is not already loaded into the <i>page
 | |
|       cache</i>, must be read from the database file before the <i>read-only
 | |
|       transaction</i> may be considered opened. This is handled by 
 | |
|       requirement H35240.
 | |
|   <h3 id=hot_journal_detection>Hot Journal Detection</h3>
 | |
|     <p>
 | |
|       This section describes the procedure that SQLite uses to detect a
 | |
|       <i>hot journal file</i>. If a <i>hot journal file</i> is detected,
 | |
|       this indicates that at some point the process of writing a 
 | |
|       transaction to the database was interrupted and a recovery operation
 | |
|       (<i>hot journal rollback</i>) needs to take place. This section does
 | |
|       not describe the process of <i>hot journal rollback</i> (see section
 | |
|       <cite>hot_journal_rollback</cite>) or the processes by which a
 | |
|       <i>hot journal file</i> may be created (see section
 | |
|       <cite>writing_data</cite>).
 | |
|     <p>
 | |
|       The procedure used to detect a <i>hot-journal file</i> is quite
 | |
|       complex. The following steps take place:
 | |
|       <ol class=spacedlist>
 | |
|         <li>Using the VFS xAccess() method, SQLite queries the file-system 
 | |
|             to see if the journal file associated with the database exists. 
 | |
|             If it does not, then there is no hot-journal file.
 | |
|         <li>By invoking the xCheckReservedLock() method of the file-handle
 | |
|             opened on the database file, SQLite checks if some other connection
 | |
|             holds a <i>reserved lock</i> or greater. If some other connection
 | |
|             does hold a <i>reserved lock</i>, this indicates that the other
 | |
|             connection is midway through a <i>read/write transaction</i> (see
 | |
|             section <cite>writing_data</cite>). In this case the 
 | |
|             <i>journal file</i> is not a <i>hot-journal</i> and must not be 
 | |
|             rolled back.
 | |
|         <li>Using the xFileSize() method of the file-handle opened
 | |
|             on the database file, SQLite checks if the database file is 
 | |
|             0 bytes in size. If it is, the journal file is not considered
 | |
|             to be a <i>hot journal</i> file. Instead of rolling back the
 | |
|             journal file, in this case it is deleted from the file-system
 | |
|             by calling the VFS xDelete() method. <span class=todo>Technically,
 | |
|             there is a race condition here. This step should be moved to
 | |
|             after the exclusive lock is held.</span>
 | |
|         <li>An attempt is made to upgrade to an <i>exclusive lock</i> on the
 | |
|             database file. If the attempt fails, then all locks, including 
 | |
|             the recently obtained <i>shared lock</i> are dropped. The attempt
 | |
|             to open a <i>read-only transaction</i> has failed. This occurs
 | |
|             when some other connection is also attempting to open a 
 | |
|             <i>read-only transaction</i> and the attempt to gain the
 | |
|             <i>exclusive lock</i> fails because the other connection is also
 | |
|             holding a <i>shared lock</i>. It is left to the other connection 
 | |
|             to roll back the <i>hot journal</i>.
 | |
|             <div style="margin-top:0.5em"></div>
 | |
|             It is important that the file-handle lock is upgraded 
 | |
|             directly from <i>shared</i> to <i>exclusive</i> in this case,
 | |
|             instead of first upgrading to <i>reserved</i> or <i>pending</i>
 | |
|             locks as is required when obtaining an <i>exclusive lock</i> to
 | |
|             write to the database file (section <cite>writing_data</cite>).
 | |
|             If SQLite were to first upgrade to a <i>reserved</i> or
 | |
|             <i>pending</i> lock in this scenario, then a second process also
 | |
|             trying to open a <i>read-transaction</i> on the database file might
 | |
|             detect the <i>reserved</i> lock in step 2 of this process, 
 | |
|             conclude that there was no <i>hot journal</i>, and commence
 | |
|             reading data from the <i>database file</i>.
 | |
|         <li>The xAccess() method is invoked again to detect if the journal 
 | |
|             file is still in the file system. If it is, then it is a 
 | |
|             hot-journal file and SQLite tries to roll it back (see section
 | |
|             <cite>rollback</cite>).
 | |
|       </ol>
 | |
|     <p class=todo> Master journal file pointers?
 | |
|     <p>
 | |
|       The following requirements describe step 1 of the above procedure in
 | |
|       more detail.
 | |
| <p class=req id=H35140>
 | |
| When required to attempt to detect a <i>hot-journal file</i>, SQLite
 | |
| shall first use the xAccess() method of the VFS layer to check if a
 | |
| journal file exists in the file-system.
 | |
| <p class=req id=H35510>
 | |
| If the call to xAccess() required by H35140 fails (due to an IO error or
 | |
| similar), then SQLite shall abandon the attempt to open a <i>read-only
 | |
| transaction</i>, relinquish the <i>shared lock</i> held on the database
 | |
| file and return an error to the user.
 | |
| <p class=req id=H35150>
 | |
| When required to attempt to detect a <i>hot-journal file</i>, if the
 | |
| call to xAccess() required by H35140 indicates that a journal file does
 | |
| not exist, then SQLite shall conclude that there is no <i>hot-journal
 | |
| file</i> in the file system and therefore that no <i>hot journal
 | |
| rollback</i> is required.
 | |
|     <p>
 | |
|       The following requirements describe step 2 of the above procedure in
 | |
|       more detail.
 | |
| <p class=req id=H35160>
 | |
| When required to attempt to detect a <i>hot-journal file</i>, if the
 | |
| call to xAccess() required by H35140 indicates that a journal file
 | |
| is present, then the xCheckReservedLock() method of the database file
 | |
| file-handle is invoked to determine whether or not some other
 | |
| process is holding a <i>reserved</i> or greater lock on the database
 | |
| file.
 | |
| <p class=req id=H35520>
 | |
| If the call to xCheckReservedLock() required by H35160 fails (due to an
 | |
| IO or other internal VFS error), then SQLite shall abandon the attempt
 | |
| to open a <i>read-only transaction</i>, relinquish the <i>shared lock</i>
 | |
| held on the database file and return an error to the user.
 | |
| <p class=req id=H35170>
 | |
| If the call to xCheckReservedLock() required by H35160 indicates that
 | |
| some other <i>database connection</i> is holding a <i>reserved</i>
 | |
| or greater lock on the database file, then SQLite shall conclude that
 | |
| there is no <i>hot journal file</i>. In this case the attempt to detect
 | |
| a <i>hot journal file</i> is concluded.
 | |
|     <p>
 | |
|       The following requirements describe step 3 of the above procedure in
 | |
|       more detail.
 | |
| <p class=req id=H35440>
 | |
| If while attempting to detect a <i>hot-journal file</i> the call to
 | |
| xCheckReservedLock() indicates that no process holds a <i>reserved</i>
 | |
| or greater lock on the <i>database file</i>, then SQLite shall open
 | |
| a file handle on the potentially hot journal file using the VFS xOpen()
 | |
| method.
 | |
| <p class=req id=H35530>
 | |
| If the call to xOpen() required by H35440 fails (due to an IO or other
 | |
| internal VFS error), then SQLite shall abandon the attempt to open a
 | |
| <i>read-only transaction</i>, relinquish the <i>shared lock</i> held on
 | |
| the database file and return an error to the user.
 | |
| <p class=req id=H35450>
 | |
| After successfully opening a file-handle on a potentially hot journal
 | |
| file, SQLite shall query the file for its size in bytes using the
 | |
| xFileSize() method of the open file handle.
 | |
| <p class=req id=H35540>
 | |
| If the call to xFileSize() required by H35450 fails (due to an IO or
 | |
| other internal VFS error), then SQLite shall abandon the attempt to open
 | |
| a <i>read-only transaction</i>, relinquish the <i>shared lock</i> held on
 | |
| the database file, close the file handle opened on the journal file and
 | |
| return an error to the user.
 | |
| <p class=req id=H35460>
 | |
| If the size of a potentially hot journal file is revealed to be zero
 | |
| bytes by a query required by H35450, then SQLite shall close the
 | |
| file handle opened on the journal file and delete the journal file using
 | |
| a call to the VFS xDelete() method. In this case SQLite shall conclude
 | |
| that there is no <i>hot journal file</i>.
 | |
| <p class=req id=H35550>
 | |
| If the call to xDelete() required by H35460 fails (due to an IO or
 | |
| other internal VFS error), then SQLite shall abandon the attempt to open
 | |
| a <i>read-only transaction</i>, relinquish the <i>shared lock</i> held on
 | |
| the database file and return an error to the user.
 | |
|     <p>
 | |
|       The following requirements describe step 4 of the above procedure in
 | |
|       more detail.
 | |
| <p class=req id=H35470>
 | |
| If the size of a potentially hot journal file is revealed to be greater
 | |
| than zero bytes by a query required by H35450, then SQLite shall attempt
 | |
| to upgrade the <i>shared lock</i> held by the <i>database connection</i>
 | |
| on the <i>database file</i> directly to an <i>exclusive lock</i>.
 | |
| <p class=req id=H35480>
 | |
| If an attempt to upgrade to an <i>exclusive lock</i> prescribed by
 | |
| H35470 fails for any reason, then SQLite shall release all locks held by
 | |
| the <i>database connection</i> and close the file handle opened on the
 | |
| <i>journal file</i>. The attempt to open a <i>read-only transaction</i>
 | |
| shall be deemed to have failed and an error returned to the user.
 | |
|     <p>
 | |
|       Finally, the following requirements describe step 5 of the above
 | |
|       procedure in more detail.
 | |
| <p class=req id=H35490>
 | |
| If, as part of the <i>hot journal file</i> detection process, the
 | |
| attempt to upgrade to an <i>exclusive lock</i> mandated by H35470 is
 | |
| successful, then SQLite shall query the file-system using the xAccess()
 | |
| method of the VFS implementation to test whether or not the journal
 | |
| file is still present in the file-system.
 | |
| <p class=req id=H35560>
 | |
| If the call to xAccess() required by H35490 fails (due to an IO or
 | |
| other internal VFS error), then SQLite shall abandon the attempt to open
 | |
| a <i>read-only transaction</i>, relinquish the lock held on the
 | |
| database file, close the file handle opened on the journal file and
 | |
| return an error to the user.
 | |
| <p class=req id=H35570>
 | |
| If the call to xAccess() required by H35490 reveals that the journal
 | |
| file is no longer present in the file system, then SQLite shall abandon
 | |
| the attempt to open a <i>read-only transaction</i>, relinquish the
 | |
| lock held on the database file, close the file handle opened on the
 | |
| journal file and return an SQLITE_BUSY error to the user.
 | |
| <p class=req id=H35500>
 | |
| If the xAccess() query required by H35490 reveals that the journal
 | |
| file is still present in the file system, then SQLite shall conclude
 | |
| that the journal file is a <i>hot journal file</i> that needs to
 | |
| be rolled back. SQLite shall immediately begin <i>hot journal
 | |
| rollback</i>.
 | |
|   <h3 id=cache_validation>Cache Validation</h3>
 | |
|     <p>
 | |
|       When a <i>database connection</i> opens a <i>read transaction</i>, the
 | |
|       <i>page cache</i> may already contain data associated with the
 | |
|       <i>database connection</i>. However, if another process has modified 
 | |
|       the database file since the cached pages were loaded it is possible that
 | |
|       the cached data is invalid.
 | |
|     <p>
 | |
|       SQLite determines whether or not the <i>page cache</i> entries belonging
 | |
|       to the <i>database connection</i> are valid using the <i>file
 | |
|       change counter</i>, a field in the <i>database file header</i>. The
 | |
|       <i>file change counter</i> is a 4-byte big-endian integer field stored
 | |
|       starting at byte offset 24 of the <i>database file header</i>. Before the
 | |
|       conclusion of a <i>read/write transaction</i> that modifies the contents
 | |
|       of the database file in any way (see section <cite>writing_data</cite>),
 | |
|       the value stored in the <i>file change counter</i> is incremented.  When
 | |
|       a <i>database connection</i> unlocks the database file, it stores the
 | |
|       current value of the <i>file change counter</i>. Later, while opening a
 | |
|       new <i>read-only transaction</i>, SQLite checks the value of the <i>file
 | |
|       change counter</i> stored in the database file. If the value has not
 | |
|       changed since the database file was unlocked, then the <i>page cache</i>
 | |
|       entries can be trusted. If the value has changed, then the <i>page
 | |
|       cache</i> entries cannot be trusted and all entries associated with
 | |
|       the current <i>database connection</i> are discarded.
 | |
| <p class=req id=H35180>
 | |
| When a file-handle open on a database file is unlocked, if the
 | |
| <i>page cache</i> contains one or more entries belonging to the
 | |
| associated <i>database connection</i>, SQLite shall store the value
 | |
| of the <i>file change counter</i> internally.
 | |
| <p class=req id=H35190>
 | |
| When required to perform <i>cache validation</i> as part of opening
 | |
| a <i>read transaction</i>, SQLite shall read a 16 byte block
 | |
| starting at byte offset 24 of the <i>database file</i> using the xRead()
 | |
| method of the <i>database connection</i>'s file handle.
 | |
|     <p class=todo>
 | |
|       Why a 16 byte block? Why not 4? (something to do with encrypted
 | |
|       databases).
 | |
| <p class=req id=H35200>
 | |
| While performing <i>cache validation</i>, after loading the 16 byte
 | |
| block as required by H35190, SQLite shall compare the 32-bit big-endian
 | |
| integer stored in the first 4 bytes of the block to the most
 | |
| recently stored value of the <i>file change counter</i> (see H35180).
 | |
| If the values are not the same, then SQLite shall conclude that
 | |
| the contents of the cache are invalid.
 | |
|     <p>
 | |
|       Requirement H35050 (section <cite>open_read_only_trans</cite>) 
 | |
|       specifies the action SQLite is required to take upon determining that 
 | |
|       the cache contents are invalid.
 | |
|   <h3 id=read_page_one>Page 1 and the Expected Page Size</h3>
 | |
|     <p>
 | |
|       As the last step in opening a <i>read transaction</i> on a database
 | |
|       file that is more than 0 bytes in size, SQLite is required to load 
 | |
|       data for page 1 of the database into the <i>page cache</i>, if it is 
 | |
|       not already there. This is slightly more complicated than it seems, 
 | |
|       as the database <i>page-size</i> is not known at this point.
 | |
|     <p>
 | |
|       Even though the database <i>page-size</i> cannot be known for sure,
 | |
|       SQLite is usually able to guess correctly by assuming it to be equal to
 | |
|       the connection's <i>expected page size</i>. The <i>expected page size</i>
 | |
|       is the value of the <i>page-size</i> field read from the 
 | |
|       <i>database file header</i> while opening the database connection 
 | |
|       (see section <cite>open_new_connection</cite>), or the <i>page-size</i>
 | |
|       of the database file when the most recent <i>read transaction</i> was concluded.
 | |
| <p class=req id=H35210>
 | |
| During the conclusion of a <i>read transaction</i>, before unlocking
 | |
| the database file, SQLite shall set the connection's
 | |
| <i>expected page size</i> to the current database <i>page-size</i>.
 | |
| <p class=req id=H35220>
 | |
| As part of opening a new <i>read transaction</i>, immediately after
 | |
| performing <i>cache validation</i>, if there is no data for database
 | |
| page 1 in the <i>page cache</i>, SQLite shall read <i>N</i> bytes from
 | |
| the start of the database file using the xRead() method of the
 | |
| connection's file handle, where <i>N</i> is the connection's current
 | |
| <i>expected page size</i> value.
 | |
| <p class=req id=H35230>
 | |
| If page 1 data is read as required by H35220, and the value of the
 | |
| <i>page-size</i> field that appears in the database file header that
 | |
| consumes the first 100 bytes of the read block is not the same as the
 | |
| connection's current <i>expected page size</i>, then the
 | |
| <i>expected page size</i> is set to this value, the database file is
 | |
| unlocked and the entire procedure to open a <i>read transaction</i>
 | |
| is repeated.
 | |
| <p class=req id=H35240>
 | |
| If page 1 data is read as required by H35220, and the value of the
 | |
| <i>page-size</i> field that appears in the database file header that
 | |
| consumes the first 100 bytes of the read block is the same as the
 | |
| connection's current <i>expected page size</i>, then the block of data
 | |
| read is stored in the <i>page cache</i> as page 1.
 | |
|   <h2>Reading Database Data</h2>
 | |
|   <p class=todo>
 | |
|     Add something about checking the page-cache first etc.
 | |
|   <h2>Ending a Read-only Transaction</h2>
 | |
|     <p>
 | |
|       To end a <i>read-only transaction</i>, SQLite simply relinquishes the
 | |
|       <i>shared lock</i> on the file-handle open on the database file. No
 | |
|       other action is required.
 | |
| <p class=req id=H35130>
 | |
| When required to end a <i>read-only transaction</i>, SQLite shall
 | |
| relinquish the <i>shared lock</i> held on the database file by
 | |
| calling the xUnlock() method of the file-handle.
 | |
|     <p>
 | |
|       See also requirements H35180 and H35210 above.
 | |
| <h1 id=writing_data>Writing Data</h1>
 | |
|   <p>
 | |
|     Using DDL or DML SQL statements, SQLite users may modify the contents and
 | |
|     size of a database file. Exactly how changes to the logical database are
 | |
|     translated to modifications to the database file is described in 
 | |
|     <cite>ff_sqlitert_requirements</cite>. From the point of view of the
 | |
|     sub-systems described in this document, each DDL or DML statement executed
 | |
|     results in the contents of zero or more database file pages being 
 | |
|     overwritten with new data. A DDL or DML statement may also append or 
 | |
|     truncate one or more pages to or from the end of the database file. One 
 | |
|     or more DDL and/or DML statements are grouped together to make up a 
 | |
|     single <i>write transaction</i>. A <i>write transaction</i> is required 
 | |
|     to have the special properties described in section <cite>overview</cite>; 
 | |
|     a <i>write transaction</i> must be isolated, durable and atomic.
 | |
|   <p>
 | |
|     SQLite accomplishes these goals using the following techniques:
 | |
|   <ul>
 | |
|     <li><p>
 | |
|         To ensure that <i>write transactions</i> are <b>isolated</b>, before
 | |
|         beginning to modify the contents of the database file to reflect the
 | |
|         results of a <i>write transaction</i>, SQLite obtains an <i>exclusive
 | |
|         lock</i> on the <i>database file</i>. The lock is not relinquished
 | |
|         until the <i>write transaction</i> is concluded. Because reading from
 | |
|         the <i>database file</i> requires a <i>shared lock</i> (see section
 | |
|         <cite>reading_data</cite>) and holding an <i>exclusive
 | |
|         lock</i> guarantees that no other <i>database connection</i> is holding
 | |
|         or can obtain a <i>shared lock</i>, this ensures that no other
 | |
|         connection may read data from the <i>database file</i> at a point when
 | |
|         a <i>write transaction</i> has been partially applied. 
 | |
|     <li><p>Ensuring that <i>write transactions</i> are <b>atomic</b> is the most
 | |
|         complex task required of the system. In this case, <i>atomic</i> means
 | |
|         that even if a system failure occurs, an attempt to commit a <i>write
 | |
|         transaction</i> to the database file either results in all changes
 | |
|         that are a part of the transaction being successfully applied to the
 | |
|         database file, or none of the changes are successfully applied. There
 | |
|         is no chance that only a subset of the changes is applied. Hence from
 | |
|         the point of view of an external observer, the <i>write transaction</i>
 | |
|         appears to be an atomic event. 
 | |
|         <p>
 | |
|         Of course, it is usually not possible to atomically apply all the
 | |
|         changes required by a <i>write transaction</i> to a database file
 | |
|         within the file-system. For example, if a <i>write transaction</i>
 | |
|         requires ten pages of a database file to be modified, and a power
 | |
|         outage causes a system failure after SQLite has modified only five
 | |
|         pages, then the database file will almost certainly be in an
 | |
|         inconsistent state following system recovery.
 | |
|         <p>
 | |
|         SQLite solves this problem by using a <i>journal file</i>. In almost
 | |
|         all cases, before the <i>database file</i> is modified in any way, 
 | |
|         SQLite stores sufficient information in the <i>journal file</i> to
 | |
|         allow the original database file to be reconstructed if a system
 | |
|         failure occurs while the database file is being updated to reflect
 | |
|         the modifications made by the <i>write transaction</i>. Each time
 | |
|         SQLite opens a database file, it checks if such a system failure has
 | |
|         occurred and, if so, 
 | |
|         reconstructs the database file based on the contents
 | |
|         of the journal file. The procedure used to detect whether or not this
 | |
|         process, coined <i>hot journal rollback</i>, is required is described
 | |
|         in section <cite>hot_journal_detection</cite>. <i>Hot journal rollback
 | |
|         </i> itself is described in section <cite>hot_journal_rollback</cite>.
 | |
|         <p>
 | |
|         The same technique ensures that an SQLite database file cannot be
 | |
|         corrupted by a system failure that occurs at an inopportune moment.
 | |
|         If a system failure does occur before SQLite has had a chance to
 | |
|         execute sufficient <i>sync file</i> operations to ensure that the
 | |
|         changes that make up a <i>write transaction</i> have made it safely
 | |
|         to persistent storage, then the <i>journal file</i> will be used
 | |
|         to restore the database to a known good state following system
 | |
|         recovery.
 | |
|     <li><p>
 | |
|         So that <i>write transactions</i> are <b>durable</b> in the face of
 | |
|         a system failure, SQLite executes a <i>sync file</i> operation on the
 | |
|         database file before concluding the <i>write transaction</i>.
 | |
|   </ul>
 | |
|   <p>
 | |
|     The <i>page cache</i> is used to buffer modifications to the database
 | |
|     file image before they are written to the <i>database file</i>. When
 | |
|     the contents of a page are required to be modified as the result of
 | |
|     an operation within a <i>write transaction</i>, the modified copy is
 | |
|     stored in the <i>page cache</i>. Similarly, if new pages are appended
 | |
|     to the end of a database file, they are added to the <i>page cache</i>
 | |
|     instead of being immediately written to the database file within the
 | |
|     file-system. 
 | |
|   <p>
 | |
|     Ideally, all changes for an entire write transaction are buffered in
 | |
|     the page cache until the end of the transaction. When the user commits
 | |
|     the transaction, all changes are applied to the database file in the
 | |
|     most efficient way possible, taking into account the assumptions 
 | |
|     enumerated in section <cite>fs_performance</cite>. Unfortunately, since
 | |
|     main-memory is a limited resource, this is not always possible for 
 | |
|     large transactions. In this case changes are buffered in the <i>page
 | |
|     cache</i> until some internal condition or limit is reached,
 | |
|     then written out to the database file in order to free resources
 | |
|     as they are required. Section <cite>page_cache_algorithms</cite>
 | |
|     describes the circumstances under which changes are flushed through
 | |
|     to the database file mid-transaction to free <i>page cache</i> resources.
 | |
|   <p>
 | |
|     Even if an application or system failure does not occur while a
 | |
|     <i>write transaction</i> is in progress, a rollback operation to restore
 | |
|     the database file and <i>page cache</i> to the state that it was in before
 | |
|     the transaction started may be required. This may occur if the user
 | |
|     explicitly requests transaction rollback (by issuing a "ROLLBACK" command),
 | |
|     or automatically, as a result of encountering an SQL constraint violation (see
 | |
|     <cite>sql_sqlitert_requirements</cite>). For this reason, the original page
 | |
|     content is stored in the <i>journal file</i> before the page is even
 | |
|     modified within the <i>page cache</i>.
 | |
|   <p class=todo>
 | |
|     Introduce the following sub-sections.
 | |
|   <h2 id=journal_file_format>Journal File Format</h2>
 | |
|     <p>
 | |
|       This section describes the format used by an SQLite <i>journal file</i>.
 | |
|     <p>
 | |
|       A journal file consists of one or more <i>journal headers</i>, zero
 | |
|       or more <i>journal records</i> and optionally a <i>master journal
 | |
|       pointer</i>. Each journal file always begins with a
 | |
|       <i>journal header</i>, followed by zero or more <i>journal records</i>.
 | |
|       Following this may be a second <i>journal header</i> followed by a
 | |
|       second set of zero or more <i>journal records</i> and so on. There
 | |
|       is no limit to the number of <i>journal headers</i> a journal file
 | |
|       may contain. Following the <i>journal headers</i> and their accompanying
 | |
|       sets of <i>journal records</i> may be the optional <i>master journal
 | |
|       pointer</i>. Or, the file may simply end following the final <i>journal
 | |
|       record</i>.
 | |
|     <p>
 | |
|       This section only describes the format of the journal file and the
 | |
|       various objects that make it up. But because a journal file may be
 | |
|       read by an SQLite process following recovery from a system failure
 | |
|       (<i>hot journal rollback</i>, see section
 | |
|       <cite>hot_journal_rollback</cite>) it is also important to describe
 | |
|       the way the file is created and populated within the file-system
 | |
|       using a combination of <i>write file</i>, <i>sync file</i> and
 | |
|       <i>truncate file</i> operations. These are described in section
 | |
|       <cite>write_transactions</cite>.
 | |
|     <h3 id=journal_header_format>Journal Header Format</h3>
 | |
|     <p>
 | |
|       A <i>journal header</i> is <i>sector-size</i> bytes in size, where <i>
 | |
|       sector-size</i> is the value returned by the xSectorSize method of
 | |
|       the file handle opened on the database file. Only the first 28 bytes
 | |
|       of the <i>journal header</i> are used, the remainder may contain garbage
 | |
|       data. The first 28 bytes of each <i>journal header</i> consist of an
 | |
|       eight byte block set to a well-known value, followed by five big-endian 
 | |
|       32-bit unsigned integer fields.
 | |
|     <center><img src="images/fileformat/journal_header.gif">
 | |
|     <p><i>Figure <span class=fig id=figure_journal_header></span> - Journal Header Format</i>
 | |
|       </center>
 | |
|     <p>
 | |
|       Figure <cite>figure_journal_header</cite> graphically depicts the layout
 | |
|       of a <i>journal header</i>. The individual fields are described in
 | |
|       the following table. The offsets in the 'byte offset' column of the
 | |
|       table are relative to the start of the <i>journal header</i>.
 | |
|     <table class=striped>
 | |
|       <tr><th>Byte offset<th>Size in bytes<th width=100%>Description
 | |
|       <tr><td>0<td>8<td>The <b>journal magic</b> field always contains a
 | |
|                         well-known 8-byte string value used to identify SQLite
 | |
|                         journal files. The well-known sequence of byte values
 | |
|                         is:
 | |
|                         <pre>0xd9 0xd5 0x05 0xf9 0x20 0xa1 0x63 0xd7</pre>
 | |
|       <tr><td>8<td>4<td>This field, the <b>record count</b>, is set to the
 | |
|                         number of <i>journal records</i> that follow this
 | |
|                         <i>journal header</i> in the <i>journal file</i>.
 | |
|       <tr><td>12<td>4<td>The <b>checksum initializer</b> field is set to a 
 | |
|                          pseudo-random value. It is used as part of the
 | |
|                          algorithm to calculate the checksum for all <i>journal
 | |
|                          records</i> that follow this <i>journal header</i>.
 | |
|       <tr><td>16<td>4<td>This field, the <b>database page count</b>, is set
 | |
|                          to the number of pages that the <i>database file</i>
 | |
|                          contained before any modifications associated with
 | |
|                          the <i>write transaction</i> are applied.
 | |
|       <tr><td>20<td>4<td>This field, the <b>sector size</b>, is set to the
 | |
|                          <i>sector size</i> of the device on which the 
 | |
|                          <i>journal file</i> was created, in bytes. This value
 | |
|                          is required when reading the journal file to determine
 | |
|                          the size of each <i>journal header</i>.
 | |
|       <tr><td>24<td>4<td>The <b>page size</b> field contains the database page
 | |
|                          size used by the corresponding <i>database file</i>
 | |
|                          when the <i>journal file</i> was created, in bytes.
 | |
|     </table>
 | |
|     <p>
 | |
|       All <i>journal headers</i> are positioned in the file so that they 
 | |
|       start at a <i>sector size</i> aligned offset. To achieve this, unused
 | |
|       space may be left between the start of the second and subsequent
 | |
|       <i>journal headers</i> and the end of the <i>journal records</i>
 | |
|       associated with the previous header.
 | |
|   <h3 id=journal_record_format>Journal Record Format</h3>
 | |
|     <p>
 | |
|       Each <i>journal record</i> contains the original data for a database page
 | |
|       modified by the <i>write transaction</i>. If a rollback is required, then
 | |
|       this data may be used to restore the contents of the database page to the
 | |
|       state it was in before the <i>write transaction</i> was started.
 | |
|     <center><img src="images/fileformat/journal_record.gif">
 | |
|     <p><i>Figure <span class=fig id=figure_journal_record></span> - Journal Record Format</i>
 | |
|       </center>
 | |
|     <p>
 | |
|       A <i>journal record</i>, depicted graphically by figure
 | |
|       <cite>figure_journal_record</cite>, contains three fields, as described
 | |
|       in the following table. Byte offsets are relative to the start of the
 | |
|       <i>journal record</i>.
 | |
|     <table class=striped>
 | |
|       <tr><th>Byte offset<th>Size in bytes<th width=100%>Description
 | |
|       <tr><td>0<td>4<td>The page number of the database page associated with
 | |
|                         this <i>journal record</i>, stored as a 4 byte
 | |
|                         big-endian unsigned integer.
 | |
|       <tr><td>4<td><i>page-size</i><td>
 | |
|                         This field contains the original data for the page,
 | |
|                         exactly as it appeared in the database file before the
 | |
|                         <i>write transaction</i> began.
 | |
|       <tr><td style="white-space: nowrap">4 + <i>page-size</i><td>4<td>
 | |
|                         This field contains a checksum value, calculated based
 | |
|                         on the contents of the journaled database page data
 | |
|                         (the previous field) and the values stored in the
 | |
|                         <i>checksum initializer</i> field of the preceding
 | |
|                         <i>journal header</i>.
 | |
|     </table>
 | |
|     <p>
 | |
|       The set of <i>journal records</i> that follow a <i>journal header</i>
 | |
|       in a <i>journal file</i> are packed tightly together. There are no
 | |
|       alignment requirements for <i>journal records</i> as there are for
 | |
|       <i>journal headers</i>.
 | |
|   <h3>Master Journal Pointer</h3>
 | |
|     <p>
 | |
|       To support <i>atomic</i> transactions that modify more than one 
 | |
|       database file, SQLite sometimes includes a <i>master journal pointer</i>
 | |
|       record in a <i>journal file</i>. Multiple file transactions are 
 | |
|       described in section <cite>multifile_transactions</cite>. A <i>
 | |
|       master journal pointer</i> contains the name of a <i>master journal-file
 | |
|       </i> along with a check-sum and some well known values that allow
 | |
|       the <i>master journal pointer</i> to be recognized as such when
 | |
|       the <i>journal file</i> is read during a rollback operation (section
 | |
|       <cite>rollback</cite>).
 | |
|     <p>
 | |
|       As is the case for a <i>journal header</i>, the start of a <i>master
 | |
|       journal pointer</i> is always positioned at a <i>sector size</i> 
 | |
|       aligned offset. If the <i>journal record</i> or <i>journal header</i>
 | |
|       that appears immediately before the <i>master journal pointer</i> does
 | |
|       not end at an aligned offset, then unused space is left between the
 | |
|       end of the <i>journal record</i> or <i>journal header</i> and the start
 | |
|       of the <i>master journal pointer</i>.
 | |
|     <center><img src="images/fileformat/master_journal_ptr.gif">
 | |
|     <p><i>Figure <span class=fig id=figure_master_journal_ptr></span> - Master Journal Pointer Format</i>
 | |
|       </center>
 | |
|     <p>
 | |
|       A <i>master journal pointer</i>, depicted graphically by figure
 | |
|       <cite>figure_master_journal_ptr</cite>, contains five fields, as 
 | |
|       described in the following table. Byte offsets are relative to the 
 | |
|       start of the <i>master journal pointer</i>.
 | |
|     <table class=striped>
 | |
|       <tr><th>Byte offset<th>Size in bytes<th width=100%>Description
 | |
|       <tr><td>0<td>4<td>This field, the <b>locking page number</b>, is always
 | |
|                set to the page number of the database <i>locking page</i>
 | |
|                stored as a 4-byte big-endian integer. The <i>locking page</i>
 | |
|                is the page that begins at byte offset 2<sup>30</sup> of the
 | |
|                database file. Even if the database file is large enough to
 | |
|                contain the <i>locking page</i>, the <i>locking page</i> is
 | |
|                never used to store any data and so the first four bytes of a
 | |
|                valid <i>journal record</i> will never contain this value. For
 | |
|                further description of the <i>locking page</i>, refer to
 | |
|                <cite>ff_sqlitert_requirements</cite>.
 | |
|       <tr><td>4<td><i>name-length</i><td>
 | |
|                The <b>master journal name</b> field contains the name of the
 | |
|                master journal file, encoded as a utf-8 string. There is no
 | |
|                nul-terminator appended to the string.
 | |
|       <tr><td>4 + <i>name-length</i><td>4<td>
 | |
|                The <b>name-length</b> field contains the length of the 
 | |
|                previous field in bytes, formatted as a 4-byte big-endian 
 | |
|                unsigned integer.
 | |
|       <tr><td>8 + <i>name-length</i><td>4<td>
 | |
|                The <b>checksum</b> field contains a checksum value stored as
 | |
|                a 4-byte big-endian signed integer. The checksum value is
 | |
|                calculated as the sum of the bytes that make up the <i>
 | |
|                master journal name</i> field, interpreting each byte as
 | |
|                an 8-bit signed integer.
 | |
|       <tr><td style="white-space: nowrap">12 + <i>name-length</i><td>8<td>
 | |
|                Finally, the <b>journal magic</b> field always contains a
 | |
|                well-known 8-byte string value; the same value stored in the
 | |
|                first 8 bytes of a <i>journal header</i>. The well-known
 | |
|                sequence of bytes is:
 | |
|                  <pre>0xd9 0xd5 0x05 0xf9 0x20 0xa1 0x63 0xd7</pre>
 | |
|     </table>
 | |
|   <h2 id=write_transactions>Write Transactions</h2>
 | |
|     <p>
 | |
|       This section describes the progression of an SQLite <i>write
 | |
|       transaction</i>. From the point of view of the systems described in
 | |
|       this document, most <i>write transactions</i> consist of three steps:
 | |
|     <ol>
 | |
|       <li><p>The <i>write transaction</i> is opened. This process is described
 | |
|           in section <cite>opening_a_write_transaction</cite>.
 | |
|       <li><p>The end-user executes DML or DDL SQL statements that require the
 | |
|           structure of the database file to be modified.
 | |
|           These modifications may be any combination of operations to 
 | |
|           <ul><li>modify the content of an existing database page, 
 | |
|               <li>append a new database page to the database file image, or
 | |
|               <li>truncate (discard) a database page from the end of the
 | |
|                   database file. 
 | |
|           </ul>
 | |
|           These operations are described in detail in section
 | |
|           <cite>modifying_appending_truncating</cite>. How user DDL or DML
 | |
|           SQL statements are mapped to combinations of these three operations
 | |
|           is described in <cite>ff_sqlitert_requirements</cite>.
 | |
|       <li><p>The <i>write transaction</i> is concluded and the changes made
 | |
|           permanently committed to the database. The process required to 
 | |
|           commit a transaction is described in section
 | |
|           <cite>committing_a_transaction</cite>.
 | |
|     </ol>
 | |
|     <p>
 | |
|       As an alternative to step 3 above, the transaction may be rolled back.
 | |
|       Transaction rollback is described in section <cite>rollback</cite>.
 | |
|       Finally, it is also important to remember that a <i>write transaction</i>
 | |
|       may be interrupted by a <i>system failure</i> at any point. In this
 | |
|       case, the contents of the file-system (the <i>database file</i> and
 | |
|       <i>journal file</i>) must be left in such a state so as to enable
 | |
|       the <i>database file</i> to be restored to the state it was in before
 | |
|       the interrupted <i>write transaction</i> was started. This is known
 | |
|       as <i>hot journal rollback</i>, and is described in section
 | |
|       <cite>hot_journal_rollback</cite>. Section
 | |
|       <cite>fs_assumption_details</cite> describes the assumptions made 
 | |
|       regarding the effects of a <i>system failure</i> on the file-system
 | |
|       contents following recovery.
 | |
|   <h3 id=opening_a_write_transaction>Beginning a Write Transaction</h3>
 | |
|     <p>
 | |
|       Before any database pages may be modified within the <i>page cache</i>,
 | |
|       the <i>database connection</i> must open a <i>write transaction</i>. 
 | |
|       Opening a <i>write transaction</i> requires that the <i>database
 | |
|       connection</i> obtains a <i>reserved lock</i> (or greater) on the 
 | |
|       <i>database file</i>. Because obtaining a <i>reserved lock</i> on
 | |
|       a <i>database file</i> guarantees that no other <i>database
 | |
|       connection</i> may hold or obtain a <i>reserved lock</i> or greater,
 | |
|       it follows that no other <i>database connection</i> may have an
 | |
|       open <i>write transaction</i>.
 | |
|     <p>
 | |
|       A <i>reserved lock</i> on the <i>database file</i> may be thought of
 | |
|       as an exclusive lock on the <i>journal file</i>. No 
 | |
|       <i>database connection</i> may read from or write to a <i>journal
 | |
|       file</i> without a <i>reserved</i> or greater lock on the corresponding
 | |
|       <i>database file</i>.
 | |
|     <p>
 | |
|       Before opening a <i>write transaction</i>, a <i>database connection</i>
 | |
|       must have an open <i>read transaction</i>, opened via the procedure
 | |
|       described in section <cite>open_read_only_trans</cite>. This ensures
 | |
|       that there is no <i>hot-journal file</i> that needs to be rolled back
 | |
|       and that any data stored in the <i>page cache</i> can be trusted.
 | |
|     <p>
 | |
|       Once a <i>read transaction</i> has been opened, upgrading to a 
 | |
|       <i>write transaction</i> is a two step process, as follows:
 | |
|     <ol>
 | |
|       <li>A <i>reserved lock</i> is obtained on the <i>database file</i>.
 | |
|       <li>The <i>journal file</i> is opened and created if necessary (using 
 | |
|           the VFS xOpen method), and a <i>journal file header</i> written 
 | |
|           to the start of it using a single call to the file handle's xWrite 
 | |
|           method.
 | |
|     </ol>
 | |
|     <p>
 | |
|       Requirements describing step 1 of the above procedure in detail:
 | |
| <p class=req id=H35350>
 | |
| When required to open a <i>write transaction</i> on the database,
 | |
| SQLite shall first open a <i>read transaction</i>, if the <i>database
 | |
| connection</i> in question has not already opened one.
 | |
| <p class=req id=H35360>
 | |
| When required to open a <i>write transaction</i> on the database, after
 | |
| ensuring a <i>read transaction</i> has already been opened, SQLite
 | |
| shall obtain a <i>reserved lock</i> on the database file by calling
 | |
| the xLock method of the file-handle open on the database file.
 | |
| <p class=req id=H35580>
 | |
| If an attempt to acquire a <i>reserved lock</i> prescribed by
 | |
| requirement H35360 fails, then SQLite shall deem the attempt to
 | |
| open a <i>write transaction</i> to have failed and return an error
 | |
| to the user.
 | |
|     <p>
 | |
|       Requirements describing step 2 of the above procedure in detail:
 | |
| <p class=req id=H35370>
 | |
| When required to open a <i>write transaction</i> on the database, after
 | |
| obtaining a <i>reserved lock</i> on the database file, SQLite shall
 | |
| open a read/write file-handle on the corresponding <i>journal file</i>.
 | |
| <p class=req id=H35380>
 | |
| When required to open a <i>write transaction</i> on the database, after
 | |
| opening a file-handle on the <i>journal file</i>, SQLite shall append
 | |
| a <i>journal header</i> to the (currently empty) <i>journal file</i>.
 | |
|     <h4 id=writing_journal_header>Writing a Journal Header</h4>
 | |
|     <p>
 | |
|       Requirements describing how a <i>journal header</i> is appended to
 | |
|       a journal file:
 | |
| <p class=req id=H35680>
 | |
| When required to append a <i>journal header</i> to the <i>journal
 | |
| file</i>, SQLite shall do so by writing a block of <i>sector-size</i>
 | |
| bytes using a single call to the xWrite method of the file-handle
 | |
| open on the <i>journal file</i>. The block of data written shall begin
 | |
| at the smallest sector-size aligned offset at or following the current
 | |
| end of the <i>journal file</i>.
 | |
| <p class=req id=H35690>
 | |
| The first 8 bytes of the <i>journal header</i> required to be written
 | |
| by H35680 shall contain the following values, in order from byte offset 0
 | |
| to 7: 0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63 and 0xd7.
 | |
| <p class=req id=H35700>
 | |
| Bytes 8-11 of the <i>journal header</i> required to be written by
 | |
| H35680 shall contain 0x00.
 | |
| <p class=req id=H35710>
 | |
| Bytes 12-15 of the <i>journal header</i> required to be written by
 | |
| H35680 shall contain the number of pages that the database file
 | |
| contained when the current <i>write-transaction</i> was started,
 | |
| formatted as a 4-byte big-endian unsigned integer.
 | |
| <p class=req id=H35720>
 | |
| Bytes 16-19 of the <i>journal header</i> required to be written by
 | |
| H35680 shall contain pseudo-randomly generated values.
 | |
| <p class=req id=H35730>
 | |
| Bytes 20-23 of the <i>journal header</i> required to be written by
 | |
| H35680 shall contain the <i>sector size</i> used by the VFS layer,
 | |
| formatted as a 4-byte big-endian unsigned integer.
 | |
| <p class=req id=H35740>
 | |
| Bytes 24-27 of the <i>journal header</i> required to be written by
 | |
| H35680 shall contain the <i>page size</i> used by the database at
 | |
| the start of the <i>write transaction</i>, formatted as a 4-byte
 | |
| big-endian unsigned integer.
 | |
|   <h3 id=modifying_appending_truncating>
 | |
|     Modifying, Adding or Truncating a Database Page
 | |
|   </h3>
 | |
|     <p>
 | |
|       When the end-user executes a DML or DDL SQL statement to modify the
 | |
|       database schema or content, SQLite is required to update the database
 | |
|       file image to reflect the new database state. This involves modifying
 | |
|       the content of, appending or truncating one or more database file 
 | |
|       pages. Instead of modifying the database file directly using the VFS
 | |
|       interface, changes are first buffered within the <i>page cache</i>.
 | |
|     <p>
 | |
|       Before modifying a database page within the <i>page cache</i> that
 | |
|       may need to be restored by a rollback operation, the page must be
 | |
|       <i>journalled</i>. <i>Journalling a page</i> is the process of copying
 | |
|       that page's original data into the journal file so that it can be
 | |
|       recovered if the <i>write transaction</i> is rolled back. The process
 | |
|       of journalling a page is described in section 
 | |
|       <cite>journalling_a_page</cite>.
 | |
| <p class=req id=H35590>
 | |
| When required to modify the contents of an existing database page that
 | |
| existed and was not a <i>free-list leaf page</i> when the <i>write
 | |
| transaction</i> was opened, SQLite shall journal the page if it has not
 | |
| already been journalled within the current <i>write transaction</i>.
 | |
| <p class=req id=H35600>
 | |
| When required to modify the contents of an existing database page,
 | |
| SQLite shall update the cached version of the database page content
 | |
| stored as part of the <i>page cache entry</i> associated with the page.
 | |
|     <p>
 | |
|       When a new database page is appended to a database file, there is
 | |
|       no requirement to add a record to the <i>journal file</i>. If a 
 | |
|       rollback is required the database file will simply be truncated back 
 | |
|       to its original size based on the value stored at byte offset 12
 | |
|       of the <i>journal file</i>.
 | |
| <p class=req id=H35610>
 | |
| When required to append a new database page to the database file,
 | |
| SQLite shall create a new <i>page cache entry</i> corresponding to
 | |
| the new page and insert it into the <i>page cache</i>. The <i>dirty
 | |
| flag</i> of the new <i>page cache entry</i> shall be set.
 | |
|     <p>
 | |
|       If required to truncate a database page from the end of the database
 | |
|       file, the associated <i>page cache entry</i> is discarded. The adjusted
 | |
|       size of the database file is stored internally. The database file
 | |
|       is not actually truncated until the current <i>write transaction</i>
 | |
|       is committed (see section <cite>committing_a_transaction</cite>).
 | |
| <p class=req id=H35620>
 | |
| When required to truncate (remove) a database page that existed and was
 | |
| not a <i>free-list leaf page</i> when the <i>write transaction</i> was
 | |
| opened from the end of a database file, SQLite shall journal the page if
 | |
| it has not already been journalled within the current <i>write
 | |
| transaction</i>.
 | |
| <p class=req id=H35630>
 | |
| When required to truncate a database page from the end of the database
 | |
| file, SQLite shall discard the associated <i>page cache entry</i>
 | |
| from the page cache.
 | |
|   <h4 id=journalling_a_page>Journalling a Database Page</h4>
 | |
|     <p>
 | |
|       A page is journalled by adding a <i>journal record</i> to the <i>
 | |
|       journal file</i>. The format of a <i>journal record</i> is described
 | |
|       in section <cite>journal_record_format</cite>.
 | |
| <p class=req id=H35270>
 | |
| When required to <i>journal a database page</i>, SQLite shall first
 | |
| append the <i>page number</i> of the page being journalled to the
 | |
| <i>journal file</i>, formatted as a 4-byte big-endian unsigned integer,
 | |
| using a single call to the xWrite method of the file-handle opened
 | |
| on the journal file.
 | |
| <p class=req id=H35280>
 | |
| When required to <i>journal a database page</i>, if the attempt to
 | |
| append the <i>page number</i> to the journal file is successful,
 | |
| then the current page data (<i>page-size</i> bytes) shall be appended
 | |
| to the journal file, using a single call to the xWrite method of the
 | |
| file-handle opened on the journal file.
 | |
| <p class=req id=H35290>
 | |
| When required to <i>journal a database page</i>, if the attempt to
 | |
| append the current page data to the journal file is successful,
 | |
| then SQLite shall append a 4-byte big-endian integer checksum value
 | |
| to the journal file, using a single call to the xWrite method
 | |
| of the file-handle opened on the journal file.
 | |
|     <p>
 | |
|       The checksum value written to the <i>journal file</i> immediately after
 | |
|       the page data (requirement H35290), is a function of both the page
 | |
|       data and the <i>checksum initializer</i> field stored in the 
 | |
|       <i>journal header</i> (see section <cite>journal_header_format</cite>).
 | |
|       Specifically, it is the sum of the <i>checksum initializer</i> and
 | |
|       the value of every 200th byte of page data interpreted as an 8-bit
 | |
|       unsigned integer, starting with the (<i>page-size</i> % 200)'th 
 | |
|       byte of page data. For example, if the <i>page-size</i> is 1024 bytes,
 | |
|       then a checksum is calculated by adding the values of the bytes at
 | |
|       offsets 23, 223, 423, 623, 823 and 1023 (the last byte of the page)
 | |
|       together with the value of the <i>checksum initializer</i>.
 | |
| <p class=req id=H35300>
 | |
| The checksum value written to the <i>journal file</i> by the write
 | |
| required by H35290 shall be equal to the sum of the <i>checksum
 | |
| initializer</i> field stored in the <i>journal header</i> (H35700) and
 | |
| every 200th byte of the page data, beginning with the
 | |
| (<i>page-size</i> % 200)th byte.
 | |
|     <p>
 | |
|       The '%' character is used in requirement H35300 to represent the
 | |
|       modulo operator, just as it is in programming languages such as C, Java
 | |
|       and JavaScript.
 | |
|   <h3 id=syncing_journal_file>Syncing the Journal File</h3>
 | |
|     <p>
 | |
|       Even after the original data of a database page has been written into
 | |
|       the journal file using calls to the journal file file-handle xWrite 
 | |
|       method (section <cite>journalling_a_page</cite>), it is still not
 | |
|       safe to write to the page within the database file. This is because
 | |
|       in the event of a system failure the data written to the journal file
 | |
|       may still be corrupted (see section <cite>fs_characteristics</cite>).
 | |
|       Before the page can be updated within the database itself, the 
 | |
|       following procedure takes place:
 | |
|     <ol>
 | |
|       <li> The xSync method of the file-handle opened on the journal file 
 | |
|            is called. This operation ensures that all <i>journal records</i>
 | |
|            in the journal file have been written to persistent storage, and
 | |
|            that they will not become corrupted as a result of a subsequent
 | |
|            system failure.
 | |
|       <li> The <i>journal record count</i> field (see section 
 | |
|            <cite>journal_header_format</cite>) of the most recently written
 | |
|            journal header in the journal file is updated to contain the
 | |
|            number of <i>journal records</i> added to the journal file since
 | |
|            the header was written.
 | |
|       <li> The xSync method is called again, to ensure that the update to
 | |
|            the <i>journal record count</i> has been committed to persistent
 | |
|            storage.
 | |
|     </ol> 
 | |
|     <p>
 | |
|       If all three of the steps enumerated above are executed successfully,
 | |
|       then it is safe to modify the content of the <i>journalled</i> 
 | |
|       database pages within the database file itself. The combination of
 | |
|       the three steps above is referred to as <i>syncing the journal file</i>.
 | |
| <p class=req id=H35750>
 | |
| When required to <i>sync the journal file</i>, SQLite shall invoke the
 | |
| xSync method of the file handle open on the <i>journal file</i>.
 | |
| <p class=req id=H35760>
 | |
| When required to <i>sync the journal file</i>, after invoking the
 | |
| xSync method as required by H35750, SQLite shall update the <i>record
 | |
| count</i> of the <i>journal header</i> most recently written to the
 | |
| <i>journal file</i>. The 4-byte field shall be updated to contain
 | |
| the number of <i>journal records</i> that have been written to the
 | |
| <i>journal file</i> since the <i>journal header</i> was written,
 | |
| formatted as a 4-byte big-endian unsigned integer.
 | |
| <p class=req id=H35770>
 | |
| When required to <i>sync the journal file</i>, after updating the
 | |
| <i>record count</i> field of a <i>journal header</i> as required by
 | |
| H35760, SQLite shall invoke the xSync method of the file handle open
 | |
| on the <i>journal file</i>.
 | |
|   <h3 id=upgrading_to_exclusive_lock>Upgrading to an Exclusive Lock</h3>
 | |
|     <p>
 | |
|       Before the content of a page modified within the <i>page cache</i> may
 | |
|       be written to the database file, an <i>exclusive lock</i> must be held
 | |
|       on the database file. The purpose of this lock is to prevent another
 | |
|       connection from reading from the database file while the first 
 | |
|       connection is midway through writing to it. Whether the reason for
 | |
|       writing to the database file is because a transaction is being committed,
 | |
|       or to free up space within the <i>page cache</i>, upgrading to an 
 | |
|       <i>exclusive lock</i> always occurs immediately after 
 | |
|       <i>syncing the journal file</i>.
 | |
| <p class=req id=H35780>
 | |
| When required to upgrade to an <i>exclusive lock</i> as part of a write
 | |
| transaction, SQLite shall first attempt to obtain a <i>pending lock</i>
 | |
| on the database file if one is not already held by invoking the xLock
 | |
| method of the file handle opened on the <i>database file</i>.
 | |
| <p class=req id=H35790>
 | |
| When required to upgrade to an <i>exclusive lock</i> as part of a write
 | |
| transaction, after successfully obtaining a <i>pending lock</i> SQLite
 | |
| shall attempt to obtain an <i>exclusive lock</i> by invoking the
 | |
| xLock method of the file handle opened on the <i>database file</i>.
 | |
|     <p class=todo>
 | |
|       What happens if the exclusive lock cannot be obtained? It is not
 | |
|       possible for the attempt to upgrade from a reserved to a pending 
 | |
|       lock to fail.
 | |
|   <h3 id=committing_a_transaction>Committing a Transaction</h3>
 | |
|     <p>
 | |
|       Committing a <i>write transaction</i> is the final step in updating the
 | |
|       database file. Committing a transaction is a seven step process,
 | |
|       summarized as follows:
 | |
|     <ol>
 | |
|       <li><p>
 | |
|         The database file header <i>change counter</i> field is incremented.
 | |
|         The <i>change counter</i>, described in
 | |
|         <cite>ff_sqlitert_requirements</cite>, is used by the <i>cache
 | |
|         validation</i> procedure described in section
 | |
|         <cite>cache_validation</cite>.
 | |
|       <li><p> 
 | |
|         The <i>journal file</i> is synced. The steps required to <i>sync the
 | |
|         journal file</i> are described in section
 | |
|         <cite>syncing_journal_file</cite>.
 | |
|       <li><p>
 | |
|         Upgrade to an <i>exclusive lock</i> on the database file, if an
 | |
|         <i>exclusive lock</i> is not already held. Upgrading to an 
 | |
|         <i>exclusive lock</i> is described in section
 | |
|         <cite>upgrading_to_exclusive_lock</cite>.
 | |
|       <li><p> 
 | |
|         Copy the contents of all <i>dirty pages</i> stored in the <i>page
 | |
|         cache</i> into the database file. The dirty pages are written
 | |
|         to the database file in page number order to improve
 | |
|         performance (see the assumptions in section <cite>fs_performance</cite>
 | |
|         for details).
 | |
|       <li><p>
 | |
|         The database file is synced to ensure that all updates are stored
 | |
|         safely on the persistent media.
 | |
|       <li><p>
 | |
|         The file-handle open on the <i>journal file</i> is closed and the
 | |
|         journal file itself deleted. At this point the <i>write transaction</i>
 | |
|         has been irrevocably committed.
 | |
|       <li><p>
 | |
|         The database file is unlocked.
 | |
|     </ol>
 | |
|     <p class=todo>
 | |
|       Expand on and explain the above a bit.
 | |
|     <p>
 | |
|       The following requirements describe the steps enumerated above in more
 | |
|       detail.
 | |
| <p class=req id=H35800>
 | |
| When required to <i>commit a write-transaction</i>, SQLite shall
 | |
| modify page 1 to increment the value stored in the <i>change counter</i>
 | |
| field of the <i>database file header</i>.
 | |
|     <p>
 | |
|       The <i>change counter</i> is a 4-byte big-endian integer field stored
 | |
|       at byte offset 24 of the <i>database file</i>. The modification to page 1
 | |
|       required by H35800 is made using the process described in section
 | |
|       <cite>modifying_appending_truncating</cite>. If page 1 has not already
 | |
|       been journalled as a part of the current write-transaction, then
 | |
|       incrementing the <i>change counter</i> may require that page 1 be
 | |
|       journalled. In all cases the <i>page cache entry</i> corresponding to
 | |
|       page 1 becomes a <i>dirty page</i> as part of incrementing the <i>change
 | |
|       counter</i> value.
 | |
| <p class=req id=H35810>
 | |
| When required to <i>commit a write-transaction</i>, after incrementing
 | |
| the <i>change counter</i> field, SQLite shall <i>sync the journal
 | |
| file</i>.
 | |
| <p class=req id=H35820>
 | |
| When required to <i>commit a write-transaction</i>, after <i>syncing
 | |
| the journal file</i> as required by H35810, if an <i>exclusive lock</i>
 | |
| on the database file is not already held, SQLite shall attempt to
 | |
| <i>upgrade to an exclusive lock</i>.
 | |
| <p class=req id=H35830>
 | |
| When required to <i>commit a write-transaction</i>, after <i>syncing
 | |
| the journal file</i> as required by H35810 and ensuring that an
 | |
| <i>exclusive lock</i> is held on the database file as required by
 | |
| H35820, SQLite shall copy the contents of all <i>dirty pages</i>
 | |
| stored in the <i>page cache</i> into the <i>database file</i> using
 | |
| calls to the xWrite method of the <i>database connection</i> file
 | |
| handle. Each call to xWrite shall write the contents of a single
 | |
| <i>dirty page</i> (<i>page-size</i> bytes of data) to the database
 | |
| file. Dirty pages shall be written in order of <i>page number</i>,
 | |
| from lowest to highest.
 | |
| <p class=req id=H35840>
 | |
| When required to <i>commit a write-transaction</i>, after copying the
 | |
| contents of any <i>dirty pages</i> to the database file as required
 | |
| by H35830, SQLite shall sync the database file by invoking the xSync
 | |
| method of the <i>database connection</i> file handle.
 | |
| <p class=req id=H35850>
 | |
| When required to <i>commit a write-transaction</i>, after syncing
 | |
| the database file as required by H35840, SQLite shall close the
 | |
| file-handle opened on the <i>journal file</i> and delete the
 | |
| <i>journal file</i> from the file system via a call to the VFS
 | |
| xDelete method.
 | |
| <p class=req id=H35860>
 | |
| When required to <i>commit a write-transaction</i>, after deleting
 | |
| the <i>journal file</i> as required by H35850, SQLite shall relinquish
 | |
| all locks held on the <i>database file</i> by invoking the xUnlock
 | |
| method of the <i>database connection</i> file handle.
 | |
|     <p class=todo>
 | |
|       Is the shared lock held after committing a <i>write transaction</i>?
 | |
|   <h3>Purging a Dirty Page</h3>
 | |
|     <p>
 | |
|       Usually, no data is actually written to the database file until the
 | |
|       user commits the active <i>write transaction</i>. The exception is
 | |
|       if a single <i>write transaction</i> contains too many modifications
 | |
|       to be stored in the <i>page cache</i>. In this case, some of the 
 | |
|       database file modifications stored in the <i>page cache</i> must be
 | |
|       applied to the database file before the transaction is committed so
 | |
|       that the associated <i>page cache entries</i> can be purged from the
 | |
|       page cache to free memory. Exactly when this condition is reached and
 | |
|       dirty pages must be purged is described in section
 | |
|       <cite>page_cache_algorithms</cite>.
 | |
|     <p>
 | |
|       Before the contents of the <i>page cache entry</i> can be written into
 | |
|       the database file, the <i>page cache entry</i> must meet the criteria
 | |
|       for a <i>writable dirty page</i>, as defined in section
 | |
|       <cite>page_cache_algorithms</cite>. If the dirty page selected by the
 | |
|       algorithms in section <cite>page_cache_algorithms</cite> for purging is not a <i>writable dirty page</i>,
 | |
|       SQLite is required to <i>sync the journal file</i>. Immediately after
 | |
|       the journal file is synced, all dirty pages associated with the
 | |
|       <i>database connection</i> are classified as <i>writable dirty pages</i>.
 | |
| <p class=req id=H35640>
 | |
| When required to purge a <i>non-writable dirty page</i> from the
 | |
| <i>page cache</i>, SQLite shall <i>sync the journal file</i> before
 | |
| proceeding with the write operation required by H35670.
 | |
| <p class=req id=H35660>
 | |
| After <i>syncing the journal file</i> as required by H35640, SQLite
 | |
| shall append a new <i>journal header</i> to the <i>journal file</i>
 | |
| before proceeding with the write operation required by H35670.
 | |
|     <p>
 | |
|       Appending a new <i>journal header</i> to the journal file is described
 | |
|       in section <cite>writing_journal_header</cite>.
 | |
|     <p>
 | |
|       Once the dirty page being purged is writable, it is simply written
 | |
|       into the database file.
 | |
| <p class=req id=H35670>
 | |
| When required to purge a <i>page cache entry</i> that is a
 | |
| <i>dirty page</i> SQLite shall write the page data into the database
 | |
| file, using a single call to the xWrite method of the <i>database
 | |
| connection</i> file handle.
 | |
|   <h2 id="multifile_transactions">Multi-File Transactions</h2>
 | |
|   <h2 id="statement_transactions">Statement Transactions</h2>
 | |
| <h1 id=rollback>Rollback</h1>
 | |
|   <h2 id=hot_journal_rollback>Hot Journal Rollback</h2>
 | |
|   <h2>Transaction Rollback</h2>
 | |
|   <h2>Statement Rollback</h2>
 | |
| <h1>References</h1>
 | |
|   <table id="refs" style="width:auto; margin: 1em 5ex">
 | |
|     <tr><td style="width:5ex" id="capi_sqlitert_requirements">[1]<td>
 | |
|       C API Requirements Document.
 | |
|     <tr><td style="width:5ex" id="sql_sqlitert_requirements">[2]<td>
 | |
|       SQL Requirements Document.
 | |
|     <tr><td style="width:5ex" id="ff_sqlitert_requirements">[3]<td>
 | |
|       File Format Requirements Document.
 | |
|   </table>
 | |
| 
 | |
| 
 |