Azureの3つの仮想マシンでmongodbクラスターを実行しています。私は自分でクラスターをセットアップし、これを約1年間使用しています。数か月前、プライマリでmongodbのクラッシュが発生しました。幸い、セカンダリが引き継ぎ、プラットフォームに害はありません。問題は、それ以降、プライマリが約2週間ごとにクラッシュすることです。
これらのサーバーはカスタム監視機能を備えたGrafanaにあり、クラッシュの直前に「I/Oの実行に費やされた時間」は大きく変動し、通常よりも高くピークに達しています。その少し後に、サーバーが次のエラーでクラッシュします。
2017-08-27T23:29:31.537+0000 F - [NetworkInterfaceASIO-BGSync-0] std::exception::what(): Resource temporarily unavailable
Actual exception type: std::system_error
0x1556b32 0x1555e42 0x1ce5506 0x1ce5551 0x14e0a35 0x14e1258 0x12d996e 0x12da16e 0x12da8a8 0x12cd06c 0x12c2a38 0x12c3f9a 0x12c4cd9 0x12c0f85 0x128c858 0x129ee53 0x129f48d 0x1576824 0x1576a21 0x12b75fb 0x1d00200 0x7fa0454f56ba 0x7fa04522b3dd
----- BEGIN BACKTRACE -----
{"backtrace":[{"b":"400000","o":"1156B32","s":"_ZN5mongo15printStackTraceERSo"},{"b":"400000","o":"1155E42"},{"b":"400000","o":"18E5506","s":"_ZN10__cxxabiv111__terminateEPFvvE"},{"b":"400000","o":"18E5551"},{"b":"400000","o":"10E0A35","s":"_ZN5mongo10ThreadPool25_startWorkerThread_inlockEv"},{"b":"400000","o":"10E1258","s":"_ZN5mongo10ThreadPool8scheduleESt8functionIFvvEE"},{"b":"400000","o":"ED996E","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESD_St11unique_lockISt5mutexE"},{"b":"400000","o":"EDA16E","s":"_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESt11unique_lockISt5mutexE"},{"b":"400000","o":"EDA8A8"},{"b":"400000","o":"ECD06C","s":"_ZN5mongo8executor20NetworkInterfaceASIO7AsyncOp6finishERKNS_10StatusWithINS0_21RemoteCommandResponseEEE"},{"b":"400000","o":"EC2A38","s":"_ZN5mongo8executor20NetworkInterfaceASIO18_completeOperationEPNS1_7AsyncOpERKNS_10StatusWithINS0_21RemoteCommandResponseEEE"},{"b":"400000","o":"EC3F9A","s":"_ZN5mongo8executor20NetworkInterfaceASIO20_completedOpCallbackEPNS1_7AsyncOpE"},{"b":"400000","o":"EC4CD9"},{"b":"400000","o":"EC0F85"},{"b":"400000","o":"E8C858","s":"_ZN4asio6detail14strand_service8dispatchINS0_7binder2IRSt8functionIFvSt10error_codemEES5_mEEEEvRPNS1_11strand_implERT_"},{"b":"400000","o":"E9EE53","s":"_ZN4asio6detail14strand_service8dispatchINS0_17rewrapped_handlerINS0_7binder2INS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS8_EEEENS_17mutable_buffers_1ENS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEESI_mEESK_EEEEvRPNS1_11strand_implERT_"},{"b":"400000","o":"E9F48D","s":"_ZN4asio6detail23reactive_socket_recv_opINS_17mutable_buffers_1ENS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS6_EEEES2_NS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEEE11do_completeEPvPNS0_19scheduler_operationERKSF_m"},{"b":"400000","o":"1176824","s":"_ZN4asio6detail9scheduler10do_run_oneERNS0_11scoped_lockINS0_11posix_mutexEEERNS0_21scheduler_thread_infoERKSt10error_code"},{"b":"400000","o":"1176A21","s":"_ZN4asio6detail9scheduler3runERSt10error_code"},{"b":"400000","o":"EB75FB"},{"b":"400000","o":"1900200"},{"b":"7FA0454EE000","o":"76BA"},{"b":"7FA045124000","o":"1073DD","s":"clone"}],"processInfo":{ "mongodbVersion" : "3.2.16", "gitVersion" : "056bf45128114e44c5358c7a8776fb582363e094", "compiledModules" : [], "uname" : { "sysname" : "Linux", "release" : "4.4.0-92-generic", "version" : "#115-Ubuntu SMP Thu Aug 10 09:04:33 UTC 2017", "machine" : "x86_64" }, "somap" : [ { "elfType" : 2, "b" : "400000", "buildId" : "B4C77D1B42936B23E28A2739927CB25274DB2D96" }, { "b" : "7FFE41517000", "elfType" : 3, "buildId" : "F23E2C79BCC8E97B12E1BB62A1BF196F8423FC40" }, { "b" : "7FA04647A000", "path" : "/lib/x86_64-linux-gnu/libssl.so.1.0.0", "elfType" : 3, "buildId" : "675F454AD6FD0B6CA2E41127C7B98079DA37F7B6" }, { "b" : "7FA046036000", "path" : "/lib/x86_64-linux-gnu/libcrypto.so.1.0.0", "elfType" : 3, "buildId" : "2DA08A7E5BF610030DD33B70DB951399626B7496" }, { "b" : "7FA045E2E000", "path" : "/lib/x86_64-linux-gnu/librt.so.1", "elfType" : 3, "buildId" : "F951C1E0765FCAE48F82CAFE35D1ADD36D6C9AF9" }, { "b" : "7FA045C2A000", "path" : "/lib/x86_64-linux-gnu/libdl.so.2", "elfType" : 3, "buildId" : "0FC788F0861846257B5F1773FBD438E95DFC1032" }, { "b" : "7FA045921000", "path" : "/lib/x86_64-linux-gnu/libm.so.6", "elfType" : 3, "buildId" : "FF7A33D389E756CA381A8189291A968EA5E1F4F8" }, { "b" : "7FA04570B000", "path" : "/lib/x86_64-linux-gnu/libgcc_s.so.1", "elfType" : 3, "buildId" : "68220AE2C65D65C1B6AAA12FA6765A6EC2F5F434" }, { "b" : "7FA0454EE000", "path" : "/lib/x86_64-linux-gnu/libpthread.so.0", "elfType" : 3, "buildId" : "27F189EF8DB8C3734C6A678E6EF3CB0B206D58B2" }, { "b" : "7FA045124000", "path" : "/lib/x86_64-linux-gnu/libc.so.6", "elfType" : 3, "buildId" : "088A6E00A1814622219F346B41E775B8DD46C518" }, { "b" : "7FA0466E3000", "path" : "/lib64/ld-linux-x86-64.so.2", "elfType" : 3, "buildId" : "9157F205547F0EB588E2AB1F2F120B74253A43EA" } ] }}
mongod(_ZN5mongo15printStackTraceERSo+0x32) [0x1556b32]
mongod(+0x1155E42) [0x1555e42]
mongod(_ZN10__cxxabiv111__terminateEPFvvE+0x6) [0x1ce5506]
mongod(+0x18E5551) [0x1ce5551]
mongod(_ZN5mongo10ThreadPool25_startWorkerThread_inlockEv+0x965) [0x14e0a35]
mongod(_ZN5mongo10ThreadPool8scheduleESt8functionIFvvEE+0x358) [0x14e1258]
mongod(_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESD_St11unique_lockISt5mutexE+0x1FE) [0x12d996e]
mongod(_ZN5mongo8executor22ThreadPoolTaskExecutor23scheduleIntoPool_inlockEPNSt7__cxx114listISt10shared_ptrINS1_13CallbackStateEESaIS6_EEERKSt14_List_iteratorIS6_ESt11unique_lockISt5mutexE+0x3E) [0x12da16e]
mongod(+0xEDA8A8) [0x12da8a8]
mongod(_ZN5mongo8executor20NetworkInterfaceASIO7AsyncOp6finishERKNS_10StatusWithINS0_21RemoteCommandResponseEEE+0x18C) [0x12cd06c]
mongod(_ZN5mongo8executor20NetworkInterfaceASIO18_completeOperationEPNS1_7AsyncOpERKNS_10StatusWithINS0_21RemoteCommandResponseEEE+0x378) [0x12c2a38]
mongod(_ZN5mongo8executor20NetworkInterfaceASIO20_completedOpCallbackEPNS1_7AsyncOpE+0x6A) [0x12c3f9a]
mongod(+0xEC4CD9) [0x12c4cd9]
mongod(+0xEC0F85) [0x12c0f85]
mongod(_ZN4asio6detail14strand_service8dispatchINS0_7binder2IRSt8functionIFvSt10error_codemEES5_mEEEEvRPNS1_11strand_implERT_+0x88) [0x128c858]
mongod(_ZN4asio6detail14strand_service8dispatchINS0_17rewrapped_handlerINS0_7binder2INS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS8_EEEENS_17mutable_buffers_1ENS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEESI_mEESK_EEEEvRPNS1_11strand_implERT_+0x6D3) [0x129ee53]
mongod(_ZN4asio6detail23reactive_socket_recv_opINS_17mutable_buffers_1ENS0_7read_opINS_19basic_stream_socketINS_2ip3tcpENS_21stream_socket_serviceIS6_EEEES2_NS0_14transfer_all_tENS0_15wrapped_handlerINS_10io_service6strandESt8functionIFvSt10error_codemEENS0_26is_continuation_if_runningEEEEEE11do_completeEPvPNS0_19scheduler_operationERKSF_m+0x1ED) [0x129f48d]
mongod(_ZN4asio6detail9scheduler10do_run_oneERNS0_11scoped_lockINS0_11posix_mutexEEERNS0_21scheduler_thread_infoERKSt10error_code+0x304) [0x1576824]
mongod(_ZN4asio6detail9scheduler3runERSt10error_code+0xC1) [0x1576a21]
mongod(+0xEB75FB) [0x12b75fb]
mongod(+0x1900200) [0x1d00200]
libpthread.so.0(+0x76BA) [0x7fa0454f56ba]
libc.so.6(clone+0x6D) [0x7fa04522b3dd]
----- END BACKTRACE -----
2017-08-27T23:29:31.487+0000 I COMMAND [conn11335] command admin.$cmd command: isMaster { ismaster: true, client: { driver: { name: "nodejs", version: "2.2.24" }, os: { type: "Linux", name: "linux", architecture: "x64", version: "4.4.0-71-generic" }, platform: "Node.js v4.8.3, LE, mongodb-core: 2.1.8" } } keyUpdates:0 writeConflicts:0 numYields:0 reslen:342 locks:{} protocol:op_query 23099ms
このエラーをデバッグするためにどこから始めればよいのか正確にはわかりません。奇妙なことに、2つのセカンダリは、まったく同じ構成であっても、ダウンタイムが発生することはありません。
必要に応じて、私のmongo構成:
storage:
dbPath: /data
journal:
enabled: true
systemLog:
destination: file
logAppend: true
path: /var/log/mongodb/mongod.log
net:
port: 27017
bindIp: 0.0.0.0
security:
keyFile: /etc/mongo-keyfile
replication:
replSetName: rs0
oplogSizeMB: 150
各サーバーは、最新リリース(現在は3.2.16)のバージョン3.2のmongoを使用してAzureでUbuntu16.04を実行しています。
私たちの環境でもまったく同じことが起こりました。その時点で別の毎日のジョブが開始され、すべてのシステムメモリが消費され、MongoDBが同じエラー+スタックトレースでクラッシュすることがあることがわかりました。
Ubuntu 14.04LTSでMongoDB3.4.9を使用した3サーバーレプリカセットを実行しています。