353 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			353 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <!DOCTYPE html>
 | |
| <html><head>
 | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0">
 | |
| <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 | |
| <link href="sqlite.css" rel="stylesheet">
 | |
| <title>Measuring and Reducing CPU Usage in SQLite</title>
 | |
| <!-- path= -->
 | |
| </head>
 | |
| <body>
 | |
| <div class=nosearch>
 | |
| <a href="index.html">
 | |
| <img class="logo" src="images/sqlite370_banner.gif" alt="SQLite" border="0">
 | |
| </a>
 | |
| <div><!-- IE hack to prevent disappearing logo --></div>
 | |
| <div class="tagline desktoponly">
 | |
| Small. Fast. Reliable.<br>Choose any three.
 | |
| </div>
 | |
| <div class="menu mainmenu">
 | |
| <ul>
 | |
| <li><a href="index.html">Home</a>
 | |
| <li class='mobileonly'><a href="javascript:void(0)" onclick='toggle_div("submenu")'>Menu</a>
 | |
| <li class='wideonly'><a href='about.html'>About</a>
 | |
| <li class='desktoponly'><a href="docs.html">Documentation</a>
 | |
| <li class='desktoponly'><a href="download.html">Download</a>
 | |
| <li class='wideonly'><a href='copyright.html'>License</a>
 | |
| <li class='desktoponly'><a href="support.html">Support</a>
 | |
| <li class='desktoponly'><a href="prosupport.html">Purchase</a>
 | |
| <li class='search' id='search_menubutton'>
 | |
| <a href="javascript:void(0)" onclick='toggle_search()'>Search</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="menu submenu" id="submenu">
 | |
| <ul>
 | |
| <li><a href='about.html'>About</a>
 | |
| <li><a href='docs.html'>Documentation</a>
 | |
| <li><a href='download.html'>Download</a>
 | |
| <li><a href='support.html'>Support</a>
 | |
| <li><a href='prosupport.html'>Purchase</a>
 | |
| </ul>
 | |
| </div>
 | |
| <div class="searchmenu" id="searchmenu">
 | |
| <form method="GET" action="search">
 | |
| <select name="s" id="searchtype">
 | |
| <option value="d">Search Documentation</option>
 | |
| <option value="c">Search Changelog</option>
 | |
| </select>
 | |
| <input type="text" name="q" id="searchbox" value="">
 | |
| <input type="submit" value="Go">
 | |
| </form>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_div(nm) {
 | |
| var w = document.getElementById(nm);
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| }else{
 | |
| w.style.display = "block";
 | |
| }
 | |
| }
 | |
| function toggle_search() {
 | |
| var w = document.getElementById("searchmenu");
 | |
| if( w.style.display=="block" ){
 | |
| w.style.display = "none";
 | |
| } else {
 | |
| w.style.display = "block";
 | |
| setTimeout(function(){
 | |
| document.getElementById("searchbox").focus()
 | |
| }, 30);
 | |
| }
 | |
| }
 | |
| function div_off(nm){document.getElementById(nm).style.display="none";}
 | |
| window.onbeforeunload = function(e){div_off("submenu");}
 | |
| /* Disable the Search feature if we are not operating from CGI, since */
 | |
| /* Search is accomplished using CGI and will not work without it. */
 | |
| if( !location.origin || !location.origin.match || !location.origin.match(/http/) ){
 | |
| document.getElementById("search_menubutton").style.display = "none";
 | |
| }
 | |
| /* Used by the Hide/Show button beside syntax diagrams, to toggle the */
 | |
| function hideorshow(btn,obj){
 | |
| var x = document.getElementById(obj);
 | |
| var b = document.getElementById(btn);
 | |
| if( x.style.display!='none' ){
 | |
| x.style.display = 'none';
 | |
| b.innerHTML='show';
 | |
| }else{
 | |
| x.style.display = '';
 | |
| b.innerHTML='hide';
 | |
| }
 | |
| return false;
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| <div class=fancy>
 | |
| <div class=nosearch>
 | |
| <div class="fancy_title">
 | |
| Measuring and Reducing CPU Usage in SQLite
 | |
| </div>
 | |
| <div class="fancy_toc">
 | |
| <a onclick="toggle_toc()">
 | |
| <span class="fancy_toc_mark" id="toc_mk">►</span>
 | |
| Table Of Contents
 | |
| </a>
 | |
| <div id="toc_sub"><div class="fancy-toc1"><a href="#overview">1. Overview</a></div>
 | |
| <div class="fancy-toc1"><a href="#measuring_performance">2. Measuring Performance</a></div>
 | |
| <div class="fancy-toc2"><a href="#compile_options">2.1. Compile Options</a></div>
 | |
| <div class="fancy-toc2"><a href="#workload">2.2. Workload</a></div>
 | |
| <div class="fancy-toc2"><a href="#performance_measurement">2.3. Performance Measurement</a></div>
 | |
| <div class="fancy-toc2"><a href="#microoptimizations">2.4. Microoptimizations</a></div>
 | |
| <div class="fancy-toc1"><a href="#performance_measurement_workflow">3. Performance Measurement Workflow</a></div>
 | |
| <div class="fancy-toc1"><a href="#limitations">4. Limitations</a></div>
 | |
| </div>
 | |
| </div>
 | |
| <script>
 | |
| function toggle_toc(){
 | |
| var sub = document.getElementById("toc_sub")
 | |
| var mk = document.getElementById("toc_mk")
 | |
| if( sub.style.display!="block" ){
 | |
| sub.style.display = "block";
 | |
| mk.innerHTML = "▼";
 | |
| } else {
 | |
| sub.style.display = "none";
 | |
| mk.innerHTML = "►";
 | |
| }
 | |
| }
 | |
| </script>
 | |
| </div>
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| <h1 id="overview"><span>1. </span>Overview</h1>
 | |
| 
 | |
| <p>The graph below shows the number of CPU cycles used by SQLite on a
 | |
| standard workload, for versions of SQLite going back about 10 years.
 | |
| Recent versions of SQLite use about one third as many CPU cycles
 | |
| as older versions.
 | |
| 
 | |
| </p><p>
 | |
| This article describes how the SQLite developers measure CPU usage,
 | |
| what those measurements actually mean, and the techniques used by
 | |
| SQLite developers on their continuing quest to further reduce the
 | |
| CPU usage of the SQLite library.
 | |
| </p>
 | |
| 
 | |
| <center>
 | |
| <hr>
 | |
| <div class="imgcontainer">
 | |
| <img src="./images/cpu-usage.jpg" alt="Graph of the CPU cycles used by SQLite on a standard workload, by SQLite version"></div><br>
 | |
| Measured using cachegrind on Ubuntu 16.04 on x64 with gcc 5.4.0 and -Os.<br>
 | |
| <hr>
 | |
| </center>
 | |
| 
 | |
| <h1 id="measuring_performance"><span>2. </span>Measuring Performance</h1>
 | |
| 
 | |
| <p>In brief, the CPU performance of SQLite is measured as follows:
 | |
| 
 | |
| </p><p></p><ol>
 | |
| <li> Compile SQLite in an as-delivered configuration, without any special
 | |
|      telemetry or debugging options.
 | |
| </li><li> Link SQLite against a test program that runs approximately 30,000
 | |
|      SQL statements representing a typical workload.
 | |
| </li><li> Count the number of CPU cycles consumed using
 | |
|      <a href="http://valgrind.org/docs/manual/cg-manual.html">cachegrind</a>.
 | |
| </li></ol>
 | |
| 
 | |
| <h2 id="compile_options"><span>2.1. </span>Compile Options</h2>
 | |
| 
 | |
| <p>For performance measurement, SQLite is compiled in approximately the same
 | |
| way as it would be for use in production systems.  The compile-time configuration
 | |
| is "approximate" in the sense that every production use of SQLite is 
 | |
| different. Compile-time options used by one system are not necessarily
 | |
| the same as those used by others.  The key point is that options that 
 | |
| significantly impact the generated machine code are avoided.  For example,
 | |
| the -DSQLITE_DEBUG option is omitted because that option inserts thousands
 | |
| of assert() statements in the middle of performance critical sections of the
 | |
| SQLite library.  The -pg option (on GCC) is omitted because it causes the
 | |
| compiler to emit extra probabilistic performance measuring code which interferes
 | |
| with actual performance measurements.
 | |
| 
 | |
| </p><p>
 | |
| For performance measurements,
 | |
| the -Os option is used (optimize for size) rather than -O2 because the
 | |
| -O2 option creates so much code movement that it is difficult to associate
 | |
| specific CPU instructions with C source code lines.
 | |
| 
 | |
| </p><h2 id="workload"><span>2.2. </span>Workload</h2>
 | |
| 
 | |
| <p>
 | |
| The "typical" workload is generated by the
 | |
| <a href="https://sqlite.org/src/file/test/speedtest1.c">speedtest1.c</a>
 | |
| program in the canonical SQLite source tree.  This program strives to
 | |
| exercise the SQLite library in a way that is typical of real-world
 | |
| applications.  Of course, every application is different, and so
 | |
| no test program can exactly mirror the behavior of all applications.
 | |
| 
 | |
| </p><p>
 | |
| The speedtest1.c program is updated from time to time as the SQLite
 | |
| developers' understanding of what constitutes "typical" usage evolves.
 | |
| 
 | |
| </p><p>
 | |
| The 
 | |
| <a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a> shell
 | |
| script, also in the canonical source tree, is used to run the speedtest1.c
 | |
| program.  To replicate the performance measurements, collect the following
 | |
| files into a single directory:
 | |
| </p><ul>
 | |
| <li> the "speed-check.sh" script,
 | |
| </li><li> the "speedtest1.c" test program, and
 | |
| </li><li> the <a href="amalgamation.html">SQLite amalgamation</a> source files "sqlite3.c" and
 | |
|      "sqlite3.h"
 | |
| </li></ul>
 | |
| <p>
 | |
| Then run "sh speed-check.sh trunk".
 | |
| 
 | |
| 
 | |
| </p><h2 id="performance_measurement"><span>2.3. </span>Performance Measurement</h2>
 | |
| 
 | |
| <p>
 | |
| <a href="http://valgrind.org/docs/manual/cg-manual.html">Cachegrind</a> is used to
 | |
| measure performance because it gives answers that are repeatable to 
 | |
| 7 or more significant digits.  In comparison, actual (wall-clock)
 | |
| run times are scarcely repeatable beyond one significant digit.
 | |
| 
 | |
| <a name="microopt"></a>
 | |
| 
 | |
| </p><h2 id="microoptimizations"><span>2.4. </span>Microoptimizations</h2>
 | |
| 
 | |
| <p>
 | |
| The high repeatability of cachegrind allows the SQLite developers to
 | |
| implement and measure "microoptimizations".  A microoptimization is
 | |
| a change to the code that results in a very small performance increase.
 | |
| Typical microoptimizations reduce the number of CPU cycles by 0.1% or
 | |
| 0.05% or even less.  Such improvements are impossible to measure with
 | |
| real-world timings.  But hundreds or thousands of microoptimizations
 | |
| add up, resulting in measurable real-world performance gains.
 | |
| 
 | |
| </p><h1 id="performance_measurement_workflow"><span>3. </span>Performance Measurement Workflow</h1>
 | |
| 
 | |
| <p>
 | |
| As SQLite developers edit the SQLite source code, they run the
 | |
| <a href="https://sqlite.org/src/file/tool/speed-check.sh">speed-check.sh</a>
 | |
| shell script to track the performance impact of changes.  This
 | |
| script compiles the speedtest1.c program, runs it under cachegrind,
 | |
| processes the cachegrind output using the
 | |
| <a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> TCL
 | |
| script, then saves the results in a series of text files.
 | |
| Typical output from the speed-check.sh script looks like this:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
| ==8683== 
 | |
| ==8683== I   refs:      <font color="red">1,060,925,768</font>
 | |
| ==8683== I1  misses:       23,731,246
 | |
| ==8683== LLi misses:            5,176
 | |
| ==8683== I1  miss rate:          2.24%
 | |
| ==8683== LLi miss rate:          0.00%
 | |
| ==8683== 
 | |
| ==8683== D   refs:        557,686,925  (361,828,925 rd   + 195,858,000 wr)
 | |
| ==8683== D1  misses:        5,067,063  (  3,544,278 rd   +   1,522,785 wr)
 | |
| ==8683== LLd misses:           57,958  (     16,067 rd   +      41,891 wr)
 | |
| ==8683== D1  miss rate:           0.9% (        1.0%     +         0.8%  )
 | |
| ==8683== LLd miss rate:           0.0% (        0.0%     +         0.0%  )
 | |
| ==8683== 
 | |
| ==8683== LL refs:          28,798,309  ( 27,275,524 rd   +   1,522,785 wr)
 | |
| ==8683== LL misses:            63,134  (     21,243 rd   +      41,891 wr)
 | |
| ==8683== LL miss rate:            0.0% (        0.0%     +         0.0%  )
 | |
|    text	   data	    bss	    dec	    hex	filename
 | |
|  523044	   8240	   1976	 <font color="red">533260</font>	  8230c	sqlite3.o
 | |
|  220507 1007870 7769352 sqlite3.c
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>The important parts of the output (the parts that the developers pay
 | |
| the most attention to) are shown in red.
 | |
| Basically, the developers want to know the size of the compiled SQLite
 | |
| library and how many CPU cycles were needed to run the performance test.
 | |
| 
 | |
| </p><p>The output from the 
 | |
| <a href="https://sqlite.org/src/file/tool/cg_anno.tcl">cg_anno.tcl</a> script
 | |
| shows the number of CPU cycles spent on each line of code.
 | |
| The report is approximately 80,000 lines long.  The following is a brief
 | |
| snippet taken from the middle of the report to show what it looks like:
 | |
| 
 | |
| </p><blockquote><pre>
 | |
|          .  SQLITE_PRIVATE int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
 | |
|          .    MemPage *pPage;
 | |
|          .    assert( cursorOwnsBtShared(pCur) );
 | |
|          .    assert( pRes!=0 );
 | |
|          .    assert( *pRes==0 || *pRes==1 );
 | |
|          .    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
 | |
|    369,648    pCur->info.nSize = 0;
 | |
|    369,648    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
 | |
|    369,648    *pRes = 0;
 | |
|    739,296    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
 | |
|  1,473,580    pPage = pCur->apPage[pCur->iPage];
 | |
|  1,841,975    if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
 | |
|      4,340      pCur->aiIdx[pCur->iPage]--;
 | |
|      5,593      return btreeNext(pCur, pRes);
 | |
|          .    }
 | |
|    728,110    if( pPage->leaf ){
 | |
|          .      return SQLITE_OK;
 | |
|          .    }else{
 | |
|      3,117      return moveToLeftmost(pCur);
 | |
|          .    }
 | |
|    721,876  }
 | |
| </pre></blockquote>
 | |
| 
 | |
| <p>
 | |
| The number to the left of each line is the CPU cycle count for that line of code,
 | |
| of course.
 | |
| 
 | |
| </p><p>
 | |
| The cg_anno.tcl script removes extraneous details from the default 
 | |
| cachegrind annotation
 | |
| output so that before-and-after reports can be compared using a 
 | |
| side-by-side diff to view specific details of how a
 | |
| microoptimization attempt affected performance.
 | |
| 
 | |
| 
 | |
| </p><h1 id="limitations"><span>4. </span>Limitations</h1>
 | |
| 
 | |
| <p>The use of the standardized speedtest1.c workload and cachegrind has
 | |
| enabled significant performance improvement.
 | |
| However, it is important to recognize the limitations of this approach:
 | |
| 
 | |
| </p><ul>
 | |
| <li><p>
 | |
| Performance measurements are done with a single compiler (gcc 5.4.0),
 | |
| optimization setting (-Os), and
 | |
| on a single platform (Ubuntu 16.04 LTS on x64).  The performance of
 | |
| other compilers and processors may vary.
 | |
| 
 | |
| </p></li><li><p>
 | |
| The speedtest1.c workload that is being measured tries to be representative
 | |
| of a wide range of typical uses of SQLite.  But every application is
 | |
| different.  The speedtest1.c workload might not be a good proxy for the
 | |
| kinds of activities performed by some applications.  The SQLite developers
 | |
| are constantly working to improve the speedtest1.c program, to make it
 | |
| a better proxy for actual SQLite usage.  Community feedback is welcomed.
 | |
| 
 | |
| </p></li><li><p>
 | |
| The cycle counts provided by cachegrind are a good proxy for actual
 | |
| performance, but they are not 100% accurate.
 | |
| 
 | |
| </p></li><li><p>
 | |
| Only CPU cycle counts are being measured here. 
 | |
| CPU cycle counts are a good proxy for energy consumption,
 | |
| but do not necessarily correlate well with real-world timings.
 | |
| Time spent doing I/O is not reflected in the CPU cycle counts,
 | |
| and I/O time predominates in many SQLite usage scenarios.
 | |
| </p></li></ul>
 | |
| 
 |