##SQLite入門與分析(四)---Page Cache之事務處理(3)
寫在前面:由于內容較多,所以斷續沒有寫完的內容。
###11、刪除日志文件(Deleting The Rollback Journal)
一旦更改寫入設備,日志文件將會被刪除,這是事務真正提交的時刻。如果在這之前系統發生崩潰,就會進行恢復處理,使得數據庫和沒發生改變一樣;如果在這之后系統發生崩潰,表明所有的更改都已經寫入磁盤。SQLite就是根據日志存在情況決定是否對數據庫進行恢復處理。
刪除文件本質上不是一個原子操作,但是從用戶進程的角度來看是一個原子操作,所以一個事務看起來是一個原子操作。
在許多系統中,刪除文件也是一個高代價的操作。作為優化,SQLite可以配置成把日志文件的長度截為0或者把日志文件頭清零。

###12、釋放鎖(Releasing The Lock)
作為原子提交的最后一步,釋放排斥鎖使得其它進程可以開始訪問數據庫了。
下圖中,我們指明了當鎖被釋放的時候用戶空間所擁有的信息已經被清空了.對于老版本的SQLite你可這么認為。但最新的SQLite會保存些用戶空間的緩存不會被清空—萬一下一個事務開始的時候,這些數據剛好可以用上呢。重新利用這些內存要比再次從操作系統磁盤緩存或者硬盤中讀取要來得輕松與快捷得多,何樂而不為呢?在再次使用這些數據之前,我們必須先取得一個共享鎖,同時我們還不得不去檢查一下,保證還沒有其他進程在我們擁有共享鎖之前對數據庫文件進行了修改。數據庫文件的第一頁中有一個計數器,數據庫文件每做一次修改,這個計數器就會增長一下。我們可以通過檢查這個計數器就可得知是否有其他進程修改過數據庫文件。如果數據庫文件已經被修改過了,那么用戶內存空間的緩存就不得不清空,并重新讀入。大多數情況下,這種情況不大會發生,因此用戶空間的內存緩存將是有效的,這對于性能提高來說作用是顯著的。

以上兩步是在sqlite3BtreeCommit()---btree.c函數中實現的。
代碼如下:
~~~
//提交事務,至此一個事務完成.主要做兩件事:
//刪除日志文件,釋放數據庫文件的寫鎖
int sqlite3BtreeCommit(Btree *p){
BtShared *pBt = p->pBt;
btreeIntegrity(p);
/* If the handle has a write-transaction open, commit the shared-btrees
** transaction and set the shared state to TRANS_READ.
*/
if( p->inTrans==TRANS_WRITE ){
int rc;
assert( pBt->inTransaction==TRANS_WRITE );
assert( pBt->nTransaction>0 );
//調用pager,提交事務
rc = sqlite3pager_commit(pBt->pPager);
if( rc!=SQLITE_OK ){
return rc;
}
pBt->inTransaction = TRANS_READ;
pBt->inStmt = 0;
}
unlockAllTables(p);
/* If the handle has any kind of transaction open, decrement the transaction
** count of the shared btree. If the transaction count reaches 0, set
** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
** will unlock the pager.
*/
if( p->inTrans!=TRANS_NONE ){
pBt->nTransaction--;
if( 0==pBt->nTransaction ){
pBt->inTransaction = TRANS_NONE;
}
}
}
//提交事務,主要調用pager_unwritelock()函數
int sqlite3pager_commit(Pager *pPager){
int rc;
PgHdr *pPg;
if( pPager->errCode ){
return pPager->errCode;
}
if( pPager->state<PAGER_RESERVED ){
return SQLITE_ERROR;
}
TRACE2("COMMIT %d\n", PAGERID(pPager));
if( MEMDB ){
pPg = pager_get_all_dirty_pages(pPager);
while( pPg ){
clearHistory(PGHDR_TO_HIST(pPg, pPager));
pPg->dirty = 0;
pPg->inJournal = 0;
pPg->inStmt = 0;
pPg->needSync = 0;
pPg->pPrevStmt = pPg->pNextStmt = 0;
pPg = pPg->pDirty;
}
pPager->pDirty = 0;
#ifndef NDEBUG
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
assert( !pPg->alwaysRollback );
assert( !pHist->pOrig );
assert( !pHist->pStmt );
}
#endif
pPager->pStmt = 0;
pPager->state = PAGER_SHARED;
return SQLITE_OK;
}
if( pPager->dirtyCache==0 ){
/* Exit early (without doing the time-consuming sqlite3OsSync() calls)
** if there have been no changes to the database file. */
assert( pPager->needSync==0 );
rc = pager_unwritelock(pPager);
pPager->dbSize = -1;
return rc;
}
assert( pPager->journalOpen );
rc = sqlite3pager_sync(pPager, 0, 0);
//刪除文件,釋放寫鎖
if( rc==SQLITE_OK ){
rc = pager_unwritelock(pPager);
pPager->dbSize = -1;
}
return rc;
}
//對數據庫加read lock,刪除日志文件
static int pager_unwritelock(Pager *pPager){
PgHdr *pPg;
int rc;
assert( !MEMDB );
if( pPager->state<PAGER_RESERVED ){
return SQLITE_OK;
}
sqlite3pager_stmt_commit(pPager);
if( pPager->stmtOpen ){
sqlite3OsClose(&pPager->stfd);
pPager->stmtOpen = 0;
}
if( pPager->journalOpen ){
//關閉日志文件
sqlite3OsClose(&pPager->jfd);
pPager->journalOpen = 0;
//刪除日志文件
sqlite3OsDelete(pPager->zJournal);
sqliteFree( pPager->aInJournal );
pPager->aInJournal = 0;
for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
pPg->inJournal = 0;
pPg->dirty = 0;
pPg->needSync = 0;
#ifdef SQLITE_CHECK_PAGES
pPg->pageHash = pager_pagehash(pPg);
#endif
}
pPager->pDirty = 0;
pPager->dirtyCache = 0;
pPager->nRec = 0;
}else{
assert( pPager->aInJournal==0 );
assert( pPager->dirtyCache==0 || pPager->useJournal==0 );
}
//釋放寫鎖,加讀鎖
rc = sqlite3OsUnlock(pPager->fd, SHARED_LOCK);
pPager->state = PAGER_SHARED;
pPager->origDbSize = 0;
pPager->setMaster = 0;
pPager->needSync = 0;
pPager->pFirstSynced = pPager->pFirst;
return rc;
}
~~~
下圖可進一步描述該過程:

最后來看看sqlite3BtreeSync()和sqlite3BtreeCommit()是如何被調用的。
一般來說,事務提交方式為自動提交的話,在虛擬機中的OP_Halt指令實現提交事務,相關代碼如下:
~~~
//虛擬機停機指令
case OP_Halt: { /* no-push */
p->pTos = pTos;
p->rc = pOp->p1;
p->pc = pc;
p->errorAction = pOp->p2;
if( pOp->p3 ){
sqlite3SetString(&p->zErrMsg, pOp->p3, (char*)0);
}
//設置虛擬機狀態SQLITE_MAGIC_RUN 為 SQLITE_MAGIC_HALT,
//并提交事務
rc = sqlite3VdbeHalt(p);
assert( rc==SQLITE_BUSY || rc==SQLITE_OK );
if( rc==SQLITE_BUSY ){
p->rc = SQLITE_BUSY;
return SQLITE_BUSY;
}
return p->rc ? SQLITE_ERROR : SQLITE_DONE;
}
~~~
//當虛擬機要停機時,調用該函數,如果VDBE改變了數據庫且為自動
//提交模式,則提交這些改變
~~~
int sqlite3VdbeHalt(Vdbe *p){
sqlite3 *db = p->db;
int i;
int (*xFunc)(Btree *pBt) = 0; /* Function to call on each btree backend */
int isSpecialError; /* Set to true if SQLITE_NOMEM or IOERR */
/* This function contains the logic that determines if a statement or
** transaction will be committed or rolled back as a result of the
** execution of this virtual machine.
**
** Special errors:
**
** If an SQLITE_NOMEM error has occured in a statement that writes to
** the database, then either a statement or transaction must be rolled
** back to ensure the tree-structures are in a consistent state. A
** statement transaction is rolled back if one is open, otherwise the
** entire transaction must be rolled back.
**
** If an SQLITE_IOERR error has occured in a statement that writes to
** the database, then the entire transaction must be rolled back. The
** I/O error may have caused garbage to be written to the journal
** file. Were the transaction to continue and eventually be rolled
** back that garbage might end up in the database file.
**
** In both of the above cases, the Vdbe.errorAction variable is
** ignored. If the sqlite3.autoCommit flag is false and a transaction
** is rolled back, it will be set to true.
**
** Other errors:
**
** No error:
**
*/
if( sqlite3MallocFailed() ){
p->rc = SQLITE_NOMEM;
}
if( p->magic!=VDBE_MAGIC_RUN ){
/* Already halted. Nothing to do. */
assert( p->magic==VDBE_MAGIC_HALT );
return SQLITE_OK;
}
//釋放虛擬機中所有的游標
closeAllCursors(p);
checkActiveVdbeCnt(db);
/* No commit or rollback needed if the program never started */
if( p->pc>=0 ){
/* Check for one of the special errors - SQLITE_NOMEM or SQLITE_IOERR */
isSpecialError = ((p->rc==SQLITE_NOMEM || p->rc==SQLITE_IOERR)?1:0);
if( isSpecialError ){
/* This loop does static analysis of the query to see which of the
** following three categories it falls into:
**
** Read-only
** Query with statement journal
** Query without statement journal
**
** We could do something more elegant than this static analysis (i.e.
** store the type of query as part of the compliation phase), but
** handling malloc() or IO failure is a fairly obscure edge case so
** this is probably easier. Todo: Might be an opportunity to reduce
** code size a very small amount though
*/
int isReadOnly = 1;
int isStatement = 0;
assert(p->aOp || p->nOp==0);
for(i=0; i<p->nOp; i++){
switch( p->aOp[i].opcode ){
case OP_Transaction:
isReadOnly = 0;
break;
case OP_Statement:
isStatement = 1;
break;
}
}
/* If the query was read-only, we need do no rollback at all. Otherwise,
** proceed with the special handling.
*/
if( !isReadOnly ){
if( p->rc==SQLITE_NOMEM && isStatement ){
xFunc = sqlite3BtreeRollbackStmt;
}else{
/* We are forced to roll back the active transaction. Before doing
** so, abort any other statements this handle currently has active.
*/
sqlite3AbortOtherActiveVdbes(db, p);
sqlite3RollbackAll(db);
db->autoCommit = 1;
}
}
}
/* If the auto-commit flag is set and this is the only active vdbe, then
** we do either a commit or rollback of the current transaction.
**
** Note: This block also runs if one of the special errors handled
** above has occured.
*/
//如果自動提交事務,則提交事務
if( db->autoCommit && db->activeVdbeCnt==1 ){
if( p->rc==SQLITE_OK || (p->errorAction==OE_Fail && !isSpecialError) ){
/* The auto-commit flag is true, and the vdbe program was
** successful or hit an 'OR FAIL' constraint. This means a commit
** is required.
*/
//提交事務
int rc = vdbeCommit(db);
if( rc==SQLITE_BUSY ){
return SQLITE_BUSY;
}else if( rc!=SQLITE_OK ){
p->rc = rc;
sqlite3RollbackAll(db);
}else{
sqlite3CommitInternalChanges(db);
}
}else{
sqlite3RollbackAll(db);
}
}else if( !xFunc ){
if( p->rc==SQLITE_OK || p->errorAction==OE_Fail ){
xFunc = sqlite3BtreeCommitStmt;
}else if( p->errorAction==OE_Abort ){
xFunc = sqlite3BtreeRollbackStmt;
}else{
sqlite3AbortOtherActiveVdbes(db, p);
sqlite3RollbackAll(db);
db->autoCommit = 1;
}
}
/* If xFunc is not NULL, then it is one of sqlite3BtreeRollbackStmt or
** sqlite3BtreeCommitStmt. Call it once on each backend. If an error occurs
** and the return code is still SQLITE_OK, set the return code to the new
** error value.
*/
assert(!xFunc ||
xFunc==sqlite3BtreeCommitStmt ||
xFunc==sqlite3BtreeRollbackStmt
);
for(i=0; xFunc && i<db->nDb; i++){
int rc;
Btree *pBt = db->aDb[i].pBt;
if( pBt ){
rc = xFunc(pBt);
if( rc && (p->rc==SQLITE_OK || p->rc==SQLITE_CONSTRAINT) ){
p->rc = rc;
sqlite3SetString(&p->zErrMsg, 0);
}
}
}
/* If this was an INSERT, UPDATE or DELETE and the statement was committed,
** set the change counter.
*/
if( p->changeCntOn && p->pc>=0 ){
if( !xFunc || xFunc==sqlite3BtreeCommitStmt ){
sqlite3VdbeSetChanges(db, p->nChange);
}else{
sqlite3VdbeSetChanges(db, 0);
}
p->nChange = 0;
}
/* Rollback or commit any schema changes that occurred. */
if( p->rc!=SQLITE_OK && db->flags&SQLITE_InternChanges ){
sqlite3ResetInternalSchema(db, 0);
db->flags = (db->flags | SQLITE_InternChanges);
}
}
/* We have successfully halted and closed the VM. Record this fact. */
if( p->pc>=0 ){
db->activeVdbeCnt--;
}
p->magic = VDBE_MAGIC_HALT;
checkActiveVdbeCnt(db);
return SQLITE_OK;
}
~~~
~~~
//提交事務,主要調用:
//sqlite3BtreeSync()---同步btree, sqlite3BtreeCommit()---提交事務
static int vdbeCommit(sqlite3 *db){
int i;
int nTrans = 0; /* Number of databases with an active write-transaction */
int rc = SQLITE_OK;
int needXcommit = 0;
for(i=0; i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( pBt && sqlite3BtreeIsInTrans(pBt) ){
needXcommit = 1;
if( i!=1 ) nTrans++;
}
}
/* If there are any write-transactions at all, invoke the commit hook */
if( needXcommit && db->xCommitCallback ){
sqlite3SafetyOff(db);
rc = db->xCommitCallback(db->pCommitArg);
sqlite3SafetyOn(db);
if( rc ){
return SQLITE_CONSTRAINT;
}
}
/* The simple case - no more than one database file (not counting the
** TEMP database) has a transaction active. There is no need for the
** master-journal.
**
** If the return value of sqlite3BtreeGetFilename() is a zero length
** string, it means the main database is :memory:. In that case we do
** not support atomic multi-file commits, so use the simple case then
** too.
*/
//簡單的情況,只有一個數據庫文件,不需要master-journal
if( 0==strlen(sqlite3BtreeGetFilename(db->aDb[0].pBt)) || nTrans<=1 ){
for(i=0; rc==SQLITE_OK && i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( pBt ){
//同步btree
rc = sqlite3BtreeSync(pBt, 0);
}
}
/* Do the commit only if all databases successfully synced */
//commite事務
if( rc==SQLITE_OK ){
for(i=0; i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( pBt ){
sqlite3BtreeCommit(pBt);
}
}
}
}
/* The complex case - There is a multi-file write-transaction active.
** This requires a master journal file to ensure the transaction is
** committed atomicly.
*/
#ifndef SQLITE_OMIT_DISKIO
else{
int needSync = 0;
char *zMaster = 0; /* File-name for the master journal */
char const *zMainFile = sqlite3BtreeGetFilename(db->aDb[0].pBt);
OsFile *master = 0;
/* Select a master journal file name */
do {
u32 random;
sqliteFree(zMaster);
sqlite3Randomness(sizeof(random), &random);
zMaster = sqlite3MPrintf("%s-mj%08X", zMainFile, random&0x7fffffff);
if( !zMaster ){
return SQLITE_NOMEM;
}
}while( sqlite3OsFileExists(zMaster) );
/* Open the master journal. */
rc = sqlite3OsOpenExclusive(zMaster, &master, 0);
if( rc!=SQLITE_OK ){
sqliteFree(zMaster);
return rc;
}
/* Write the name of each database file in the transaction into the new
** master journal file. If an error occurs at this point close
** and delete the master journal file. All the individual journal files
** still have 'null' as the master journal pointer, so they will roll
** back independently if a failure occurs.
*/
for(i=0; i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( i==1 ) continue; /* Ignore the TEMP database */
if( pBt && sqlite3BtreeIsInTrans(pBt) ){
char const *zFile = sqlite3BtreeGetJournalname(pBt);
if( zFile[0]==0 ) continue; /* Ignore :memory: databases */
if( !needSync && !sqlite3BtreeSyncDisabled(pBt) ){
needSync = 1;
}
rc = sqlite3OsWrite(master, zFile, strlen(zFile)+1);
if( rc!=SQLITE_OK ){
sqlite3OsClose(&master);
sqlite3OsDelete(zMaster);
sqliteFree(zMaster);
return rc;
}
}
}
/* Sync the master journal file. Before doing this, open the directory
** the master journal file is store in so that it gets synced too.
*/
zMainFile = sqlite3BtreeGetDirname(db->aDb[0].pBt);
rc = sqlite3OsOpenDirectory(master, zMainFile);
if( rc!=SQLITE_OK ||
(needSync && (rc=sqlite3OsSync(master,0))!=SQLITE_OK) ){
sqlite3OsClose(&master);
sqlite3OsDelete(zMaster);
sqliteFree(zMaster);
return rc;
}
/* Sync all the db files involved in the transaction. The same call
** sets the master journal pointer in each individual journal. If
** an error occurs here, do not delete the master journal file.
**
** If the error occurs during the first call to sqlite3BtreeSync(),
** then there is a chance that the master journal file will be
** orphaned. But we cannot delete it, in case the master journal
** file name was written into the journal file before the failure
** occured.
*/
for(i=0; i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( pBt && sqlite3BtreeIsInTrans(pBt) ){
rc = sqlite3BtreeSync(pBt, zMaster);
if( rc!=SQLITE_OK ){
sqlite3OsClose(&master);
sqliteFree(zMaster);
return rc;
}
}
}
sqlite3OsClose(&master);
/* Delete the master journal file. This commits the transaction. After
** doing this the directory is synced again before any individual
** transaction files are deleted.
*/
rc = sqlite3OsDelete(zMaster);
assert( rc==SQLITE_OK );
sqliteFree(zMaster);
zMaster = 0;
rc = sqlite3OsSyncDirectory(zMainFile);
if( rc!=SQLITE_OK ){
/* This is not good. The master journal file has been deleted, but
** the directory sync failed. There is no completely safe course of
** action from here. The individual journals contain the name of the
** master journal file, but there is no way of knowing if that
** master journal exists now or if it will exist after the operating
** system crash that may follow the fsync() failure.
*/
return rc;
}
/* All files and directories have already been synced, so the following
** calls to sqlite3BtreeCommit() are only closing files and deleting
** journals. If something goes wrong while this is happening we don't
** really care. The integrity of the transaction is already guaranteed,
** but some stray 'cold' journals may be lying around. Returning an
** error code won't help matters.
*/
for(i=0; i<db->nDb; i++){
Btree *pBt = db->aDb[i].pBt;
if( pBt ){
sqlite3BtreeCommit(pBt);
}
}
}
#endif
return rc;
}
~~~