Mongodb源碼分析--內存文件映射(MMAP)
在Mongodb中,其使用了操作系統底層提供的內存映射機制,即MMAP。MMAP可以把磁盤文件的一部分或全部內容直接映射到內存,這樣文件中的信息位置就會在內存中有對應的地址空間,這時對文件的讀寫可以直接用指針來做,而不需要read/write函數了。同時操作系統會將數據刷新保存到磁盤上。如下圖:
鑒于linux,windows系統為mmap所提供的API大同小異(見下圖)。這里僅以mongodb對windows系統的mmap調用機制為例,來說明一下其具體的實現方式,以及在mongodb啟動時,客戶端提交查詢和插入操作請求時mongodb的mmap執行流程。
上面類圖中:
- MongoFile:定義了mongo文件對象常用操作,包括創建,關閉,設置名稱,flushAll,獲取MongoFile文件總尺寸等。
- MMF: 一個類型定義,其聲明:typedef MemoryMappedFile MMF;
- MongoMMF:為了便于journaling/durability操作,對MemoryMappedFile進行了一些封裝(特別是對private views )
下面著重看一下windows提供的mmap的常用API:
- MapViewOfFile(): 把文件數據映射到進程的地址空間
- CreateFileMapping() : 創建一個新的文件映射內核對象
- FlushViewOfFile(): 強制系統將內存中修改過的數據重新寫入磁盤映像,從而可以確保所有的數據更新能及時保存到磁盤
- CloseHandle(): 關閉文件映射對象和文件對象
- MapViewOfFileEx(): 將文件映射到指定的進程地址空間
參數說明:
- MapViewOfFile(
- __in HANDLE hFileMappingObject, /* hFileMappingObject: the shared file (file-mapping) object */
- __in DWORD dwDesiredAccess, /* dwDesiredAccess: the access/sharing attribute requested for the view */
- __in DWORD dwFileOffsetHigh, /* dwFileOffsetHigh: offset of the mapped region within the file (high-order DWORD) */
- __in DWORD dwFileOffsetLow, /* dwFileOffsetLow: offset of the mapped region within the file (low-order DWORD) */
- __in SIZE_T dwNumberOfBytesToMap /* dwNumberOfBytesToMap: length of the data to map, in bytes */
- );
- //winbase.h
- CreateFileMappingW(
- __in HANDLE hFile, /* hFile: handle of the file the mapping is created for */
- __in_opt LPSECURITY_ATTRIBUTES lpFileMappingAttributes, /* lpFileMappingAttributes: attributes of the file mapping */
- __in DWORD flProtect, /* flProtect: read/write protection used when the file is mapped */
- __in DWORD dwMaximumSizeHigh, /* high-order part of the mapping size */
- __in DWORD dwMaximumSizeLow, /* low-order part of the mapping size */
- __in_opt LPCWSTR lpName /* lpName: name of the mapping object */
- );
- // CreateFileMapping resolves to the W or the A variant depending on whether UNICODE is defined.
- #ifdef UNICODE
- #define CreateFileMapping CreateFileMappingW
- #else
- #define CreateFileMapping CreateFileMappingA
- #endif // !UNICODE
- FlushViewOfFile(
- __in LPCVOID lpBaseAddress, /* address of a byte inside the view of the memory-mapped file */
- __in SIZE_T dwNumberOfBytesToFlush /* number of bytes to flush */
- );
- MapViewOfFileEx(
- __in HANDLE hFileMappingObject, /* the shared file (file-mapping) object */
- __in DWORD dwDesiredAccess, /* the access/sharing attribute requested for the view */
- __in DWORD dwFileOffsetHigh, /* offset of the mapped region within the file (high-order DWORD) */
- __in DWORD dwFileOffsetLow, /* offset of the mapped region within the file (low-order DWORD) */
- __in SIZE_T dwNumberOfBytesToMap, /* length of the data to map, in bytes */
- __in_opt LPVOID lpBaseAddress /* address at which the view should be mapped; if there is not enough
- free address space at that address, the MapViewOfFileEx call fails */
- );
下面我們看一下mongodb如何使用上述API,來實現windows環境下對mongofile進行mmap操作的.
- //mmap_win.cpp
- // Mutex object named "mapView"; the view-mapping functions in this file hold it
- // (through scoped_lock) while they call MapViewOfFile / MapViewOfFileEx.
- mutex mapViewMutex("mapView");
- // Bit set indexed by (address / ChunkSize); clearWritableBits() clears entries in it.
- ourbitset writable;
- /**
-  * Unmapping notification: clears the entry in `writable` for every chunk index
-  * from p/ChunkSize up to and including (p+len)/ChunkSize, asserting after each
-  * clear that the bit really reads back as unset.
-  */
- void MemoryMappedFile::clearWritableBits(void *p) {
-     unsigned i = ((size_t)p)/ChunkSize;
-     while( i <= (((size_t)p)+len)/ChunkSize ) {
-         writable.clear(i);
-         assert( !writable.get(i) );
-         i++;
-     }
- }
- // Builds an object with nothing mapped yet: allocates the "flushMutex" mutex,
- // zeroes the length, the mapping handle and the file handle, then calls created().
- MemoryMappedFile::MemoryMappedFile()
-     : _flushMutex(new mutex("flushMutex")) {
-     len = 0;
-     maphandle = 0;
-     fd = 0;
-     created();
- }
- // Closes this MemoryMappedFile: for every recorded view, clears its writable bits
- // and unmaps it; then closes the file-mapping handle and the file handle (when set)
- // and resets both to 0.
- void MemoryMappedFile::close() {
-     for( size_t n = 0; n < views.size(); n++ ) {
-         void *view = views[n];
-         clearWritableBits(view);
-         UnmapViewOfFile(view);
-     }
-     views.clear();
-     if ( maphandle != 0 ) {
-         CloseHandle(maphandle); // file-mapping object
-     }
-     maphandle = 0;
-     if ( fd != 0 ) {
-         CloseHandle(fd); // file object
-     }
-     fd = 0;
- }
- // Running total, in bytes, of the lengths handed to MemoryMappedFile::map(), which adds to it.
- unsigned long long mapped = 0;
- // Creates a read-only (FILE_MAP_READ) view of the already-created file mapping.
- // Requires maphandle to be set. On success the view is recorded in `views` and
- // returned; on failure the error is logged and 0 is returned.
- void* MemoryMappedFile::createReadOnlyMap() {
-     assert( maphandle );
-     scoped_lock lk(mapViewMutex);
-     // file offset hi = 0, file offset lo = 0, byte count 0 means "map to eof"
-     void *view = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
-     if ( view != 0 ) {
-         views.push_back(view);
-         return view;
-     }
-     DWORD e = GetLastError();
-     log() << "FILE_MAP_READ MapViewOfFile failed " << filename() << " " << errnoWithDescription(e) << endl;
-     return view;
- }
- // Opens (creating it if necessary) the named file, creates a file mapping of `length`
- // bytes over it and maps a view of the whole file.
- //   filenameIn : path of the file; any ':' in its last path component is replaced by '_'
- //                for the name actually opened (at most 255 characters are used).
- //   length     : in/out; updateLength() replaces it with the existing file's length when
- //                the file already exists. The final value is stored in `len`.
- //   options    : bit flags; SEQUENTIAL adds FILE_FLAG_SEQUENTIAL_SCAN, READONLY maps the
- //                view with FILE_MAP_READ instead of FILE_MAP_ALL_ACCESS.
- // Returns the address of the view, or 0 on failure (the failure is logged).
- void* MemoryMappedFile::map(const char *filenameIn, unsigned long long &length, int options) {
-     assert( fd == 0 && len == 0 ); // can only be opened once
-     setFilename(filenameIn);
-     /* big hack here: Babble uses db names with colons. doesn't seem to work on windows. temporary perhaps. */
-     char filename[256];
-     strncpy(filename, filenameIn, 255);
-     filename[255] = 0;
-     {
-         // Scan the last path component from the end. The index is unsigned, so the
-         // loop tests-then-decrements ("i-- > 0"); an "i >= 0" test would always be
-         // true and would run off the front of the buffer when the name contains no
-         // path separator (or is empty).
-         for ( size_t i = strlen( filename ); i-- > 0; ) {
-             if ( filename[i] == '/' ||
-                     filename[i] == '\\' )
-                 break;
-             if ( filename[i] == ':' )
-                 filename[i] = '_';
-         }
-     }
-     updateLength( filename, length ); // if the file already exists, length becomes its current length
-     {
-         DWORD createOptions = FILE_ATTRIBUTE_NORMAL;
-         if ( options & SEQUENTIAL )
-             createOptions |= FILE_FLAG_SEQUENTIAL_SCAN; // hint: optimize file caching for sequential access
-         DWORD rw = GENERIC_READ | GENERIC_WRITE; // plain read/write access
-         fd = CreateFile( // open or create the file
-                  toNativeString(filename).c_str(), // file name
-                  rw, // desired access
-                  FILE_SHARE_WRITE | FILE_SHARE_READ, // share mode
-                  NULL, // security
-                  OPEN_ALWAYS, // create disposition
-                  createOptions , // flags
-                  NULL); // hTempl
-         if ( fd == INVALID_HANDLE_VALUE ) {
-             DWORD e = GetLastError();
-             log() << "Create/OpenFile failed " << filename << " errno:" << e << endl;
-             // Do not keep the invalid handle: close() treats any non-zero fd as an
-             // open handle and would pass it to CloseHandle().
-             fd = 0;
-             return 0;
-         }
-     }
-     mapped += length;
-     {
-         // page protection: read/write access to the file data
-         DWORD flProtect = PAGE_READWRITE;
-         // create the file-mapping kernel object, telling the system the size of the file and how it is accessed
-         maphandle = CreateFileMapping(fd, NULL, flProtect,
-                                       length >> 32 /*maxsizehigh*/,
-                                       (unsigned) length /*maxsizelow*/,
-                                       NULL/*lpName*/);
-         if ( maphandle == NULL ) {
-             // fetch the error code first, because log() below resets the last-error value
-             DWORD e = GetLastError();
-             log() << "CreateFileMapping failed " << filename << ' ' << errnoWithDescription(e) << endl;
-             close();
-             return 0;
-         }
-     }
-     void *view = 0;
-     DWORD mapError = 0;
-     {
-         scoped_lock lk(mapViewMutex);
-         DWORD access = (options&READONLY)? FILE_MAP_READ : FILE_MAP_ALL_ACCESS;
-         // map the file data into the address space of the process
-         view = MapViewOfFile(maphandle, access, /*f ofs hi*/0, /*f ofs lo*/ 0, /*dwNumberOfBytesToMap 0 means to eof*/0);
-         if ( view == 0 ) {
-             // read the error code before the lock is released, so that nothing run
-             // while unlocking can overwrite it
-             mapError = GetLastError();
-         }
-     }
-     if ( view == 0 ) {
-         log() << "MapViewOfFile failed " << filename << " " << errnoWithDescription(mapError) << endl;
-         close();
-     }
-     else {
-         views.push_back(view);
-     }
-     len = length;
-     return view;
- }
- // Flushable that remembers a view address, a file handle, a file name and the flush
- // mutex at construction time, so that flush() can be invoked on it afterwards.
- class WindowsFlushable : public MemoryMappedFile::Flushable {
- public:
-     WindowsFlushable( void * view , HANDLE fd , string filename , boost::shared_ptr<mutex> flushMutex )
-         : _view(view) , _fd(fd) , _filename(filename) , _flushMutex(flushMutex)
-     {}
-     // Does nothing when either the view or the handle is null. Otherwise, while holding
-     // the flush mutex, writes the modified pages of the view back to the file and then
-     // flushes the file's buffers; a failure of either step is reported through out().
-     void flush() {
-         if ( _view == 0 || _fd == 0 )
-             return;
-         scoped_lock lk(*_flushMutex);
-         // a byte count of 0 flushes the whole mapping
-         if ( !FlushViewOfFile(_view, 0) ) {
-             int err = GetLastError();
-             out() << "FlushViewOfFile failed " << err << " file: " << _filename << endl;
-         }
-         // push the file's internal buffers out to disk
-         if ( !FlushFileBuffers(_fd) ) {
-             int err = GetLastError();
-             out() << "FlushFileBuffers failed " << err << " file: " << _filename << endl;
-         }
-     }
-     void * _view;
-     HANDLE _fd;
-     string _filename;
-     boost::shared_ptr<mutex> _flushMutex;
- };
- // Flushes the first view of this file. `sync` must be true: asynchronous flushing is
- // not supported on windows and a false value trips uassert 13056. With no views
- // mapped the call is a no-op.
- void MemoryMappedFile::flush(bool sync) {
-     uassert(13056, "Async flushing not supported on windows", sync);
-     if( views.empty() )
-         return;
-     WindowsFlushable f( views[0] , fd , filename() , _flushMutex);
-     f.flush();
- }
- // Pre-flush step: returns a newly allocated Flushable, built from the first view (0 when
- // there is none), the file handle, the file name and the flush mutex, on which
- // flush() can be called later. See the flushAll operation in mmap.cpp.
- MemoryMappedFile::Flushable * MemoryMappedFile::prepareFlush() {
-     void *firstView = 0;
-     if( !views.empty() )
-         firstView = views[0];
-     return new WindowsFlushable( firstView , fd , filename() , _flushMutex );
- }
- // _lock() and _unlock() both have empty bodies in this (mmap_win.cpp) implementation.
- void MemoryMappedFile::_lock() {}
- void MemoryMappedFile::_unlock() {}
上面的代碼比較簡單,大家看一下注釋就可以了,下面看一下mmf對于上面的MemoryMappedFile類實現是如何封裝的,因為mmf會在journaling/durability這類場景下使用PrivateMap():
- //mongommf.cpp
- // Builds the private map: maps a FILE_MAP_READ view of the whole file while holding
- // mapViewMutex. On success the writable bits covering the new view are cleared and
- // the view is recorded in `views`; on failure the error is logged. Returns the view
- // address (0 on failure).
- void* MemoryMappedFile::createPrivateMap() {
-     assert( maphandle );
-     scoped_lock lk(mapViewMutex);
-     //void *p = mapaligned(maphandle, len);
-     void *view = MapViewOfFile(maphandle, FILE_MAP_READ, 0, 0, 0);
-     if ( view != 0 ) {
-         clearWritableBits(view);
-         views.push_back(view);
-         return view;
-     }
-     DWORD e = GetLastError();
-     log() << "createPrivateMap failed " << filename() << " " << errnoWithDescription(e) << endl;
-     return view;
- }
- // Re-maps the private view: unmaps the view at oldPrivateAddr and maps the file again,
- // requesting the same base address.
- //   oldPrivateAddr : address of the private view to replace.
- // Returns the address of the new view; the asserts below require it to be non-null
- // and equal to oldPrivateAddr.
- void* MemoryMappedFile::remapPrivateView(void *oldPrivateAddr) {
- dbMutex.assertWriteLocked(); // short window where we are unmapped so must be exclusive
- // mapViewMutex makes sure the remap obtains the same address
- scoped_lock lk(mapViewMutex);
- // clear the writable bits
- clearWritableBits(oldPrivateAddr);
- // remove the mapping of the file data at oldPrivateAddr from the process address space
- if( !UnmapViewOfFile(oldPrivateAddr) ) {
- DWORD e = GetLastError();
- log() << "UnMapViewOfFile failed " << filename() << ' ' << errnoWithDescription(e) << endl;
- assert(false);
- }
- // map the file at the specified address in the process address space
- void *p = MapViewOfFileEx(maphandle, FILE_MAP_READ, 0, 0,
- /*dwNumberOfBytesToMap 0 means to eof*/0 /*len*/,
- oldPrivateAddr);
- if ( p == 0 ) {
- DWORD e = GetLastError();
- log() << "MapViewOfFileEx failed " << filename() << " " << errnoWithDescription(e) << endl;
- assert(p);
- }
- assert(p == oldPrivateAddr);
- return p;
- }
- #endif
- // Re-maps this file's private view (requires cmdLine.dur): takes the current private
- // view out of privateViews, remaps it, stores the result and registers it again.
- void MongoMMF::remapThePrivateView() {
-     assert( cmdLine.dur );
-     // todo 1.9 : it turns out we require that we always remap to the same address.
-     // so the remove / add isn't necessary and can be removed
-     privateViews.remove(_view_private);
-     void *remapped = remapPrivateView(_view_private);
-     _view_private = remapped;
-     privateViews.add(_view_private, this);
- }
- ......
- // Opens the named file and memory-maps it. sequentialHint selects the SEQUENTIAL
- // option. Returns the result of finishOpening().
- bool MongoMMF::open(string fname, bool sequentialHint) {
-     setPath(fname);
-     int options = 0;
-     if( sequentialHint )
-         options = SEQUENTIAL;
-     _view_write = mapWithOptions(fname.c_str(), options);
-     return finishOpening();
- }
- // Creates the named file and memory-maps it. `len` is handed to map() by reference;
- // sequentialHint selects the SEQUENTIAL option. Returns the result of finishOpening().
- bool MongoMMF::create(string fname, unsigned long long& len, bool sequentialHint) {
-     setPath(fname);
-     int options = 0;
-     if( sequentialHint )
-         options = SEQUENTIAL;
-     _view_write = map(fname.c_str(), len, options);
-     return finishOpening();
- }
- // Completes open()/create(). Returns false when there is no write view. Without
- // cmdLine.dur the private view is simply the write view. With it, a private map is
- // created (massert 13636 if that fails) and added to the privateViews collection.
- bool MongoMMF::finishOpening() {
-     if( !_view_write )
-         return false;
-     if( !cmdLine.dur ) {
-         _view_private = _view_write;
-         return true;
-     }
-     _view_private = createPrivateMap();
-     if( _view_private == 0 ) {
-         massert( 13636 , "createPrivateMap failed (look in log for error)" , false );
-     }
-     privateViews.add(_view_private, this); // note that testIntent builds use this, even though it points to view_write then...
-     return true;
- }
- ......
- // Removes the current _view_private from the privateViews collection, zeroes both view
- // pointers, then closes the file-mapping object and the file object through
- // MemoryMappedFile::close(). When cmdLine.dur is set and the file was actually
- // opened, dur::closingFileNotification() is called first.
- void MongoMMF::close() {
-     if( cmdLine.dur ) {
-         if( _view_write ) { // actually was opened
-             if( debug )
-                 log() << "closingFileNotication:" << filename() << endl;
-             dur::closingFileNotification();
-         }
-     }
-     privateViews.remove(_view_private);
-     _view_private = 0;
-     _view_write = 0;
-     MemoryMappedFile::close();
- }
mongodb完成了上面的工具類的聲明定義之后,就會在前臺使用這些類了,下面通過插入數據操作(之前主要流程我已在這篇文章中有所描述)過程中,對上面類的使用來進行闡述.
首先需要說明的是,如果是首次在本地運行mongod,則不會在指定的數據庫目錄(dbpath 參數)下生成數據庫文件,但如果有數據插入時,則會生成相應文件,這里可以理解為生成文件的過程就是mmap的創建過程。
之前的文章中提到過,當客戶端要插入記錄時,則系統會根據客戶端的操作枚舉信息來調用相應的操作,這里它會執行instance.cpp文件中的receivedInsert方法,并進而調用 pdfile.cpp 文件的 insert()函數,而在該方法下有如下一段代碼:
- // Excerpt of DataFileMgr::insert (the "......" lines mark code omitted by the article).
- DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, const BSONElement &writeId, bool mayAddIndex) {
- ......
- NamespaceDetails *d = nsdetails(ns);// fetch the details of namespace ns
- if ( d == 0 ) {
- addNewNamespaceToCatalog(ns);// add the new namespace to the system catalog; per the original note this calls insert() again
- // create the first database file; the method is located in database.cpp
- cc().database()->allocExtent(ns, Extent::initialSize(len), false);
- ......
- }
上面的allocExtent方法用于分配Extent要求的磁盤空間,其中Extent用于記錄多個record記錄信息,而record就是數據庫中的一條記錄。這里可以將Extent看成是一個數據集合,但與我們通常所理解的"數據表"(datatable)有所差異,因為在同一個namespace下可以有一個或多個extent(可以不連續),extent之間是一個雙向鏈表結構,其通過cursor進行向前(forward)或反轉(reverse)的訪問。有關這些內容,參見我之前寫的這篇文章。
言歸正傳,在上面的allocExtent方法中,會執行pdfile.cpp中的如下方法:
- //pdfile.cpp
- // Excerpt of MongoDataFile::createExtent (the dotted lines mark code omitted by the article).
- Extent* MongoDataFile::createExtent(const char *ns, int approxSize, bool newCapped, int loops) {
- .....
- // use the requested size, capped at the unused length recorded in the file header
- int ExtentSize = approxSize <= header()->unusedLength ? approxSize : header()->unusedLength;
- DiskLoc loc;
- if ( ExtentSize < Extent::minSize() ) {// check the resulting ExtentSize against the minimum extent size
- ......
- // the addAFile method is located in database.cpp
- return cc().database()->addAFile( 0, true )->createExtent(ns, approxSize, newCapped, loops+1);
- .....
- }
最後在addAFile方法中,我們會看到如下代碼段:
- //database.cpp
- // Excerpt of Database::addAFile (the dotted line marks code omitted by the article).
- MongoDataFile* Database::addAFile( int sizeNeeded, bool preallocateNextFile ) {
- int n = (int) files.size();
- MongoDataFile *ret = getFile( n, sizeNeeded );// calls the getFile method shown below
- .....
- }
- //database.cpp
- // Excerpt of Database::getFile (the dotted lines mark code omitted by the article);
- // the part shown calls namespaceIndex.init().
- MongoDataFile* Database::getFile( int n, int sizeNeeded , bool preallocateOnly) {
- ......
- namespaceIndex.init();
- .....
- }
- //namespace.cpp
- // Excerpt of NamespaceIndex::init (the "......" lines mark code omitted by the article).
- // The part shown either opens and maps the existing file at path(), or creates and
- // maps a new one of lenForNewNsFiles bytes, leaving the view in p and its length in len.
- void NamespaceIndex::init() {
- ......
- unsigned long long len = 0;
- boost::filesystem::path nsPath = path();
- string pathString = nsPath.string();
- void *p = 0;
- if( MMF::exists(nsPath) ) {// use the MMF type introduced earlier in the article to test whether the file exists
- if( f.open(pathString, true) ) {// open the file and memory-map it
- len = f.length();
- // an existing file whose length is not a multiple of 1MB is rejected
- if ( len % (1024*1024) != 0 ) {
- log() << "bad .ns file: " << pathString << endl;
- uassert( 10079 , "bad .ns file length, cannot open database", len % (1024*1024) == 0 );
- }
- p = f.getView();// fetch the mapped view
- }
- }
- else {// the file does not exist
- // use lenForNewNsFiles, we are making a new database
- massert( 10343, "bad lenForNewNsFiles", lenForNewNsFiles >= 1024*1024 );
- maybeMkdir();// create the directory if it does not exist yet
- unsigned long long l = lenForNewNsFiles;
- if( f.create(pathString, l, true) ) {// create the file and memory-map it
- getDur().createdFile(pathString, l); // always a new file
- len = l;
- assert( len == lenForNewNsFiles );
- p = f.getView();// fetch the mapped view
- }
- }
- ......
- }
下面用一張時序圖來大體回顧一下這一流程:
在創建了該數據庫文件及相應mmap操作之后,下面再重新啟動mongod時,系統會通過構造client類的上下文對象 (context)方法來最終調用namespaceIndex.init()方法,其時序圖如下,大家可以通過調試源碼來驗證這一流程:
好了,今天的內容到這里就告一段落。
參考鏈接:
- http://www.cnblogs.com/daizhj/archive/2011/03/30/1999699.html
- http://en.wikipedia.org/wiki/Mmap
- http://linux.about.com/library/cmd/blcmdl2_mmap.htm
- http://msdn.microsoft.com/en-us/library/aa366761.aspx
- http://hi.baidu.com/%B2%A4%B2%CB%B1%F9%B1%F9/blog/item/f6e6fb2561c0136a35a80f70.html
原文鏈接:http://www.cnblogs.com/daizhj/archive/2011/04/25/mongos_mmap_source_code.html
【編輯推薦】
- 走進MongoDB的世界 展開MongoDB的學習之旅
- 淺析Mongodb源碼之游標Cursor
- 野心勃勃的NoSQL新貴 MongoDB應用實戰
- MongoDB與CouchDB全方位對比
- MongoDB1.8發布,分布式文檔數據庫