353 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			353 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
<!DOCTYPE html>
 | 
						|
<html><head>
 | 
						|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
 | 
						|
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | 
						|
<link href="sqlite.css" rel="stylesheet">
 | 
						|
<title>Measuring and Reducing CPU Usage in SQLite</title>
 | 
						|
<!-- path= -->
 | 
						|
</head>
 | 
						|
<body>
 | 
						|
<div class=nosearch>
 | 
						|
<a href="index.html">
 | 
						|
<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | 
						|
</a>
 | 
						|
<div><!-- IE hack to prevent disappearing logo --></div>
 | 
						|
<div class="tagline desktoponly">
 | 
						|
Small. Fast. Reliable.<br>Choose any three.
 | 
						|
</div>
 | 
						|
<div class="menu mainmenu">
 | 
						|
<ul>
 | 
						|
<li><a href="index.html">Home</a>
 | 
						|
<li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | 
						|
<li class='wideonly'><a href='about.html'>About</a>
 | 
						|
<li class='desktoponly'><a href="docs.html">Documentation</a>
 | 
						|
<li class='desktoponly'><a href="download.html">Download</a>
 | 
						|
<li class='wideonly'><a href='copyright.html'>License</a>
 | 
						|
<li class='desktoponly'><a href="support.html">Support</a>
 | 
						|
<li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | 
						|
<li class='search' id='search_menubutton'>
 | 
						|
<a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | 
						|
</ul>
 | 
						|
</div>
 | 
						|
<div class="menu submenu" id="submenu">
 | 
						|
<ul>
 | 
						|
<li><a href='about.html'>About</a>
 | 
						|
<li><a href='docs.html'>Documentation</a>
 | 
						|
<li><a href='download.html'>Download</a>
 | 
						|
<li><a href='support.html'>Support</a>
 | 
						|
<li><a href='prosupport.html'>Purchase</a>
 | 
						|
</ul>
 | 
						|
</div>
 | 
						|
<div class="searchmenu" id="searchmenu">
 | 
						|
<form method="GET" action="search">
 | 
						|
<select name="s" id="searchtype">
 | 
						|
<option value="d">Search Documentation</option>
 | 
						|
<option value="c">Search Changelog</option>
 | 
						|
</select>
 | 
						|
<input type="text" name="q" id="searchbox" value="">
 | 
						|
<input type="submit" value="Go">
 | 
						|
</form>
 | 
						|
</div>
 | 
						|
</div>
 | 
						|
<script>
 | 
						|
/* Flip the element whose id is nm between shown ("block") and hidden */
/* ("none").  Any display value other than "block" counts as hidden. */
function toggle_div(nm) {
  var el = document.getElementById(nm);
  el.style.display = (el.style.display=="block") ? "none" : "block";
}
 | 
						|
/* Show or hide the "searchmenu" element.  When the menu is being shown, */
/* focus is moved to the "searchbox" input after a 30ms delay. */
function toggle_search() {
  var menu = document.getElementById("searchmenu");
  if( menu.style.display=="block" ){
    menu.style.display = "none";
    return;
  }
  menu.style.display = "block";
  setTimeout(function(){
    document.getElementById("searchbox").focus()
  }, 30);
}
 | 
						|
/* Hide the element whose id is nm. */
function div_off(nm){
  var el = document.getElementById(nm);
  el.style.display = "none";
}
 | 
						|
/* Hide the "submenu" element just before the page unloads. */
window.onbeforeunload = function(e){
  div_off("submenu");
};
/* Disable the Search feature if we are not operating from CGI, since */
/* Search is accomplished using CGI and will not work without it. */
if( !(location.origin && location.origin.match && location.origin.match(/http/)) ){
  document.getElementById("search_menubutton").style.display = "none";
}
 | 
						|
/* Used by the Hide/Show button beside syntax diagrams, to toggle the */
/* display of the element whose id is obj.  The element whose id is btn */
/* is relabelled "show" or "hide" to match.  Always returns false. */
function hideorshow(btn,obj){
  var target = document.getElementById(obj);
  var button = document.getElementById(btn);
  var hiding = target.style.display!='none';
  target.style.display = hiding ? 'none' : '';
  button.innerHTML = hiding ? 'show' : 'hide';
  return false;
}
 | 
						|
</script>
 | 
						|
</div>
 | 
						|
<div class=fancy>
 | 
						|
<div class=nosearch>
 | 
						|
<div class="fancy_title">
 | 
						|
Measuring and Reducing CPU Usage in SQLite
 | 
						|
</div>
 | 
						|
<div class="fancy_toc">
 | 
						|
<a onclick="toggle_toc()">
 | 
						|
<span class="fancy_toc_mark" id="toc_mk">►</span>
 | 
						|
Table Of Contents
 | 
						|
</a>
 | 
						|
<div id="toc_sub"><div class="fancy-toc1"><a href="#overview">1. Overview</a></div>
 | 
						|
<div class="fancy-toc1"><a href="#measuring_performance">2. Measuring Performance</a></div>
 | 
						|
<div class="fancy-toc2"><a href="#compile_options">2.1. Compile Options</a></div>
 | 
						|
<div class="fancy-toc2"><a href="#workload">2.2. Workload</a></div>
 | 
						|
<div class="fancy-toc2"><a href="#performance_measurement">2.3. Performance Measurement</a></div>
 | 
						|
<div class="fancy-toc2"><a href="#microoptimizations">2.4. Microoptimizations</a></div>
 | 
						|
<div class="fancy-toc1"><a href="#performance_measurement_workflow">3. Performance Measurement Workflow</a></div>
 | 
						|
<div class="fancy-toc1"><a href="#limitations">4. Limitations</a></div>
 | 
						|
</div>
 | 
						|
</div>
 | 
						|
<script>
 | 
						|
/* Expand or collapse the "toc_sub" element and update the "toc_mk" */
/* marker glyph to match.  Any display value other than "block" is */
/* treated as collapsed. */
function toggle_toc(){
  var toc = document.getElementById("toc_sub");
  var marker = document.getElementById("toc_mk");
  var expanding = toc.style.display!="block";
  toc.style.display = expanding ? "block" : "none";
  marker.innerHTML = expanding ? "▼" : "►";
}
 | 
						|
</script>
 | 
						|
</div>
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
 | 
						|
<h1 id="overview"><span>1. </span>Overview</h1>
 | 
						|
 | 
						|
<p>The graph below shows the number of CPU cycles used by SQLite on a
 | 
						|
standard workload, for versions of SQLite going back about 10 years.
 | 
						|
Recent versions of SQLite use about one third as many CPU cycles
 | 
						|
compared to older versions.
 | 
						|
 | 
						|
</p><p>
 | 
						|
This article describes how the SQLite developers measure CPU usage,
 | 
						|
what those measurements actually mean, and the techniques used by
 | 
						|
SQLite developers on their continuing quest to further reduce the
 | 
						|
CPU usage of the SQLite library.
 | 
						|
</p>
 | 
						|
 | 
						|
<center>
 | 
						|
<hr>
 | 
						|
<div class="imgcontainer">
 | 
						|
<img src="./images/cpu-usage.jpg" alt="Graph of CPU cycles used by SQLite on a standard workload, by SQLite version"></div><br>
 | 
						|
Measured using cachegrind on Ubuntu 16.04 on x64 with gcc 5.4.0 and -Os.<br>
 | 
						|
<hr>
 | 
						|
</center>
 | 
						|
 | 
						|
<h1 id="measuring_performance"><span>2. </span>Measuring Performance</h1>
 | 
						|
 | 
						|
<p>In brief, the CPU performance of SQLite is measured as follows:
 | 
						|
 | 
						|
</p><p></p><ol>
 | 
						|
<li> Compile SQLite in an as-delivered configuration, without any special
 | 
						|
     telemetry or debugging options.
 | 
						|
</li><li> Link SQLite against a test program that runs approximately 30,000
 | 
						|
     SQL statements representing a typical workload.
 | 
						|
</li><li> Count the number of CPU cycles consumed using
 | 
						|
     <a href="http://valgrind.org/docs/manual/cg-manual.html">cachegrind</a>.
 | 
						|
</li></ol>
 | 
						|
 | 
						|
<h2 id="compile_options"><span>2.1. </span>Compile Options</h2>
 | 
						|
 | 
						|
<p>For performance measurement, SQLite is compiled in approximately the same
 | 
						|
way as it would be for use in production systems.  The compile-time configuration
 | 
						|
is "approximate" in the sense that every production use of SQLite is 
 | 
						|
different. Compile-time options used by one system are not necessarily
 | 
						|
the same as those used by others.  The key point is that options that 
 | 
						|
significantly impact the generated machine code are avoided.  For example,
 | 
						|
the -DSQLITE_DEBUG option is omitted because that option inserts thousands
 | 
						|
of assert() statements in the middle of performance critical sections of the
 | 
						|
SQLite library.  The -pg option (on GCC) is omitted because it causes the
 | 
						|
compiler to emit extra probabilistic performance measuring code which interferes
 | 
						|
with actual performance measurements.
 | 
						|
 | 
						|
</p><p>
 | 
						|
For performance measurements,
 | 
						|
the -Os option is used (optimize for size) rather than -O2 because the
 | 
						|
-O2 option creates so much code movement that it is difficult to associate
 | 
						|
specific CPU instructions with C source code lines.
 | 
						|
 | 
						|
</p><h2 id="workload"><span>2.2. </span>Workload</h2>
 | 
						|
 | 
						|
<p>
 | 
						|
The "typical" workload is generated by the
 | 
						|
<a href="https://sqlite.org/src/file/test/speedtest1.c">speedtest1.c</a>
 | 
						|
program in the canonical SQLite source tree.  This program strives to
 | 
						|
exercise the SQLite library in a way that is typical of real-world
 | 
						|
applications.  Of course, every application is different, and so
 | 
						|
no test program can exactly mirror the behavior of all applications.
 | 
						|
 | 
						|
</p><p>
 | 
						|
The speedtest1.c program is updated from time to time as the SQLite
 | 
						|
developers' understanding of what constitutes "typical" usage evolves.
 | 
						|
 | 
						|
</p><p>
 | 
						|
The 
 | 
						|
<a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a> shell
 | 
						|
script, also in the canonical source tree, is used to run the speedtest1.c
 | 
						|
program.  To replicate the performance measurements, collect the following
 | 
						|
files into a single directory:
 | 
						|
</p><ul>
 | 
						|
<li> the "speed-check.sh" script,
 | 
						|
</li><li> the "speedtest1.c" test program, and
 | 
						|
</li><li> the <a href="amalgamation.html">SQLite amalgamation</a> source files "sqlite3.c" and
 | 
						|
     "sqlite3.h"
 | 
						|
</li></ul>
 | 
						|
<p>
 | 
						|
Then run "sh speed-check.sh trunk".
 | 
						|
 | 
						|
 | 
						|
</p><h2 id="performance_measurement"><span>2.3. </span>Performance Measurement</h2>
 | 
						|
 | 
						|
<p>
 | 
						|
<a href="http://valgrind.org/docs/manual/cg-manual.html">Cachegrind</a> is used to
 | 
						|
measure performance because it gives answers that are repeatable to 
 | 
						|
7 or more significant digits.  In comparison, actual (wall-clock)
 | 
						|
run times are scarcely repeatable beyond one significant digit.
 | 
						|
 | 
						|
<a name="microopt"></a>
 | 
						|
 | 
						|
</p><h2 id="microoptimizations"><span>2.4. </span>Microoptimizations</h2>
 | 
						|
 | 
						|
<p>
 | 
						|
The high repeatability of cachegrind allows the SQLite developers to
 | 
						|
implement and measure "microoptimizations".  A microoptimization is
 | 
						|
a change to the code that results in a very small performance increase.
 | 
						|
Typical micro-optimizations reduce the number of CPU cycles by 0.1% or
 | 
						|
0.05% or even less.  Such improvements are impossible to measure with
 | 
						|
real-world timings.  But hundreds or thousands of microoptimizations
 | 
						|
add up, resulting in measurable real-world performance gains.
 | 
						|
 | 
						|
</p><h1 id="performance_measurement_workflow"><span>3. </span>Performance Measurement Workflow</h1>
 | 
						|
 | 
						|
<p>
 | 
						|
As SQLite developers edit the SQLite source code, they run the
 | 
						|
<a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a>
 | 
						|
shell script to track the performance impact of changes.  This
 | 
						|
script compiles the speedtest1.c program, runs it under cachegrind,
 | 
						|
processes the cachegrind output using the
 | 
						|
<a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> TCL
 | 
						|
script, then saves the results in a series of text files.
 | 
						|
Typical output from the speed-check.sh script looks like this:
 | 
						|
 | 
						|
</p><blockquote><pre>
 | 
						|
==8683== 
 | 
						|
==8683== I   refs:      <font color="red">1,060,925,768</font>
 | 
						|
==8683== I1  misses:       23,731,246
 | 
						|
==8683== LLi misses:            5,176
 | 
						|
==8683== I1  miss rate:          2.24%
 | 
						|
==8683== LLi miss rate:          0.00%
 | 
						|
==8683== 
 | 
						|
==8683== D   refs:        557,686,925  (361,828,925 rd   + 195,858,000 wr)
 | 
						|
==8683== D1  misses:        5,067,063  (  3,544,278 rd   +   1,522,785 wr)
 | 
						|
==8683== LLd misses:           57,958  (     16,067 rd   +      41,891 wr)
 | 
						|
==8683== D1  miss rate:           0.9% (        1.0%     +         0.8%  )
 | 
						|
==8683== LLd miss rate:           0.0% (        0.0%     +         0.0%  )
 | 
						|
==8683== 
 | 
						|
==8683== LL refs:          28,798,309  ( 27,275,524 rd   +   1,522,785 wr)
 | 
						|
==8683== LL misses:            63,134  (     21,243 rd   +      41,891 wr)
 | 
						|
==8683== LL miss rate:            0.0% (        0.0%     +         0.0%  )
 | 
						|
   text	   data	    bss	    dec	    hex	filename
 | 
						|
 523044	   8240	   1976	 <font color="red">533260</font>	  8230c	sqlite3.o
 | 
						|
 220507 1007870 7769352 sqlite3.c
 | 
						|
</pre></blockquote>
 | 
						|
 | 
						|
<p>The important parts of the output (the parts that the developers pay
 | 
						|
the most attention to) are shown in red.
 | 
						|
Basically, the developers want to know the size of the compiled SQLite
 | 
						|
library and how many CPU cycles were needed to run the performance test.
 | 
						|
 | 
						|
</p><p>The output from the 
 | 
						|
<a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> script
 | 
						|
shows the number of CPU cycles spent on each line of code.
 | 
						|
The report is approximately 80,000 lines long.  The following is a brief
 | 
						|
snippet taken from the middle of the report to show what it looks like:
 | 
						|
 | 
						|
</p><blockquote><pre>
 | 
						|
         .  SQLITE_PRIVATE int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
 | 
						|
         .    MemPage *pPage;
 | 
						|
         .    assert( cursorOwnsBtShared(pCur) );
 | 
						|
         .    assert( pRes!=0 );
 | 
						|
         .    assert( *pRes==0 || *pRes==1 );
 | 
						|
         .    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
 | 
						|
   369,648    pCur->info.nSize = 0;
 | 
						|
   369,648    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
 | 
						|
   369,648    *pRes = 0;
 | 
						|
   739,296    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
 | 
						|
 1,473,580    pPage = pCur->apPage[pCur->iPage];
 | 
						|
 1,841,975    if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
 | 
						|
     4,340      pCur->aiIdx[pCur->iPage]--;
 | 
						|
     5,593      return btreeNext(pCur, pRes);
 | 
						|
         .    }
 | 
						|
   728,110    if( pPage->leaf ){
 | 
						|
         .      return SQLITE_OK;
 | 
						|
         .    }else{
 | 
						|
     3,117      return moveToLeftmost(pCur);
 | 
						|
         .    }
 | 
						|
   721,876  }
 | 
						|
</pre></blockquote>
 | 
						|
 | 
						|
<p>
 | 
						|
The numbers on the left are the CPU cycle counts for that line of code,
 | 
						|
of course.
 | 
						|
 | 
						|
</p><p>
 | 
						|
The cg_anno.tcl script removes extraneous details from the default 
 | 
						|
cachegrind annotation
 | 
						|
output so that before-and-after reports can be compared using a 
 | 
						|
side-by-side diff to view specific details of how a
 | 
						|
micro-optimization attempt affected performance.
 | 
						|
 | 
						|
 | 
						|
</p><h1 id="limitations"><span>4. </span>Limitations</h1>
 | 
						|
 | 
						|
<p>The use of the standardized speedtest1.c workload and cachegrind has
 | 
						|
enabled significant performance improvement.
 | 
						|
However, it is important to recognize the limitations of this approach:
 | 
						|
 | 
						|
</p><ul>
 | 
						|
<li><p>
 | 
						|
Performance measurements are done with a single compiler (gcc 5.4.0),
 | 
						|
optimization setting (-Os), and
 | 
						|
on a single platform (Ubuntu 16.04 LTS on x64).  The performance of
 | 
						|
other compilers and processors may vary.
 | 
						|
 | 
						|
</p></li><li><p>
 | 
						|
The speedtest1.c workload that is being measured tries to be representative
 | 
						|
of a wide range of typical uses of SQLite.  But every application is
 | 
						|
different.  The speedtest1.c workload might not be a good proxy for the
 | 
						|
kinds of activities performed by some applications.  The SQLite developers
 | 
						|
are constantly working to improve the speedtest1.c program, to make it
 | 
						|
a better proxy for actual SQLite usage.  Community feedback is welcomed.
 | 
						|
 | 
						|
</p></li><li><p>
 | 
						|
The cycle counts provided by cachegrind are a good proxy for actual
 | 
						|
performance, but they are not 100% accurate.
 | 
						|
 | 
						|
</p></li><li><p>
 | 
						|
Only CPU cycle counts are being measured here. 
 | 
						|
CPU cycle counts are a good proxy for energy consumption,
 | 
						|
but do not necessarily correlate well with real-world timings.
 | 
						|
Time spent doing I/O is not reflected in the CPU cycle counts,
 | 
						|
and I/O time predominates in many SQLite usage scenarios.
 | 
						|
</p></li></ul>
 | 
						|
 |