I am trying to perform continuous speech recognition using AVCapture on the iOS 10 beta. I have set up captureOutput(...) to continuously get CMSampleBuffers, and I put these buffers directly into the SFSpeechAudioBufferRecognitionRequest that I set up previously like this:
// ... do some setup
SFSpeechRecognizer.requestAuthorization { authStatus in
    if authStatus == SFSpeechRecognizerAuthorizationStatus.authorized {
        self.m_recognizer = SFSpeechRecognizer()
        self.m_recognRequest = SFSpeechAudioBufferRecognitionRequest()
        self.m_recognRequest?.shouldReportPartialResults = false
        self.m_isRecording = true
    } else {
        print("not authorized")
    }
}
// ... do further setup
func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer: CMSampleBuffer!, from connection: AVCaptureConnection!) {
    if !m_AV_initialized {
        print("captureOutput(...): not initialized !")
        return
    }
    if !m_isRecording {
        return
    }
    let formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer)
    let mediaType = CMFormatDescriptionGetMediaType(formatDesc!)
    if mediaType == kCMMediaType_Audio {
        // process audio here
        m_recognRequest?.appendAudioSampleBuffer(sampleBuffer)
    }
}
Everything works for a few seconds. Then captureOutput is no longer called. If I comment out the line appendAudioSampleBuffer(sampleBuffer), captureOutput is called for as long as the app runs (as expected). Obviously, putting the sample buffers into the speech recognition engine somehow blocks further execution. My guess is that the available buffers are consumed after some time, and the process somehow halts because it can't get buffers anymore?
I should mention that everything recorded during the first two seconds leads to correct recognitions. Since Apple has not put any text into the beta docs: how is SFSpeechAudioBufferRecognitionRequest.endAudio() supposed to be used?
Does anybody know something here?
Thanks, Chris
I converted the SpeakToMe sample Swift code from the speech recognition WWDC developer talk to Objective-C, and it worked for me. For the Swift version, see https://developer.apple.com/videos/play/wwdc2016/509/; the Objective-C version is below.
- (void)viewDidAppear:(BOOL)animated {
    _recognizer = [[SFSpeechRecognizer alloc] initWithLocale:[NSLocale localeWithLocaleIdentifier:@"en-US"]];
    [_recognizer setDelegate:self];
    [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus authStatus) {
        switch (authStatus) {
            case SFSpeechRecognizerAuthorizationStatusAuthorized:
                // User gave access to speech recognition
                NSLog(@"Authorized");
                break;
            case SFSpeechRecognizerAuthorizationStatusDenied:
                // User denied access to speech recognition
                NSLog(@"SFSpeechRecognizerAuthorizationStatusDenied");
                break;
            case SFSpeechRecognizerAuthorizationStatusRestricted:
                // Speech recognition restricted on this device
                NSLog(@"SFSpeechRecognizerAuthorizationStatusRestricted");
                break;
            case SFSpeechRecognizerAuthorizationStatusNotDetermined:
                // Speech recognition not yet authorized
                break;
            default:
                NSLog(@"Default");
                break;
        }
    }];
    audioEngine = [[AVAudioEngine alloc] init];
    _speechSynthesizer = [[AVSpeechSynthesizer alloc] init];
    [_speechSynthesizer setDelegate:self];
}
- (void)startRecording
{
    [self clearLogs:nil];
    NSError *outError;
    AVAudioSession *audioSession = [AVAudioSession sharedInstance];
    [audioSession setCategory:AVAudioSessionCategoryRecord error:&outError];
    [audioSession setMode:AVAudioSessionModeMeasurement error:&outError];
    [audioSession setActive:true withOptions:AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation error:&outError];
    request2 = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
    inputNode = [audioEngine inputNode];
    if (request2 == nil) {
        NSLog(@"Unable to create a SFSpeechAudioBufferRecognitionRequest object");
    }
    if (inputNode == nil) {
        NSLog(@"Unable to create an inputNode object");
    }
    request2.shouldReportPartialResults = true;
    _currentTask = [_recognizer recognitionTaskWithRequest:request2 delegate:self];
    [inputNode installTapOnBus:0 bufferSize:4096 format:[inputNode outputFormatForBus:0] block:^(AVAudioPCMBuffer *buffer, AVAudioTime *when) {
        NSLog(@"Block tap!");
        [request2 appendAudioPCMBuffer:buffer];
    }];
    [audioEngine prepare];
    [audioEngine startAndReturnError:&outError];
    NSLog(@"Error %@", outError);
}
- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition:(SFSpeechRecognitionResult *)result {
    NSLog(@"speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition");
    NSString *translatedString = [[[result bestTranscription] formattedString] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
    [self log:translatedString];
    if ([result isFinal]) {
        [audioEngine stop];
        [inputNode removeTapOnBus:0];
        _currentTask = nil;
        request2 = nil;
    }
}
I succeeded in using SFSpeechRecognizer continuously. The main point is to use an AVCaptureSession to capture audio and transfer it to the SpeechRecognizer. Sorry, my Swift is poor, so here is the ObjC version only.
Here is my sample code (some UI code is left out; the important parts are marked):
@interface ViewController ()<AVCaptureAudioDataOutputSampleBufferDelegate,SFSpeechRecognitionTaskDelegate>
@property (nonatomic, strong) AVCaptureSession *capture;
@property (nonatomic, strong) SFSpeechAudioBufferRecognitionRequest *speechRequest;
@end
@implementation ViewController
- (void)startRecognizer
{
    [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus status) {
        if (status == SFSpeechRecognizerAuthorizationStatusAuthorized) {
            NSLocale *local = [[NSLocale alloc] initWithLocaleIdentifier:@"fr_FR"];
            SFSpeechRecognizer *sf = [[SFSpeechRecognizer alloc] initWithLocale:local];
            self.speechRequest = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
            [sf recognitionTaskWithRequest:self.speechRequest delegate:self];
            // should call startCapture method in main queue or it may crash
            dispatch_async(dispatch_get_main_queue(), ^{
                [self startCapture];
            });
        }
    }];
}

- (void)endRecognizer
{
    // END capture and END voice recognition,
    // or Apple will terminate this task after 30000 ms.
    [self endCapture];
    [self.speechRequest endAudio];
}
- (void)startCapture
{
    NSError *error;
    self.capture = [[AVCaptureSession alloc] init];
    AVCaptureDevice *audioDev = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeAudio];
    if (audioDev == nil) {
        NSLog(@"Couldn't create audio capture device");
        return;
    }
    // create mic device
    AVCaptureDeviceInput *audioIn = [AVCaptureDeviceInput deviceInputWithDevice:audioDev error:&error];
    if (error != nil) {
        NSLog(@"Couldn't create audio input");
        return;
    }
    // add mic device to the capture object
    if ([self.capture canAddInput:audioIn] == NO) {
        NSLog(@"Couldn't add audio input");
        return;
    }
    [self.capture addInput:audioIn];
    // export audio data
    AVCaptureAudioDataOutput *audioOutput = [[AVCaptureAudioDataOutput alloc] init];
    [audioOutput setSampleBufferDelegate:self queue:dispatch_get_main_queue()];
    if ([self.capture canAddOutput:audioOutput] == NO) {
        NSLog(@"Couldn't add audio output");
        return;
    }
    [self.capture addOutput:audioOutput];
    [audioOutput connectionWithMediaType:AVMediaTypeAudio];
    [self.capture startRunning];
}

- (void)endCapture
{
    if (self.capture != nil && [self.capture isRunning]) {
        [self.capture stopRunning];
    }
}

- (void)captureOutput:(AVCaptureOutput *)captureOutput didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer fromConnection:(AVCaptureConnection *)connection
{
    // forward every captured audio sample buffer to the recognition request
    [self.speechRequest appendAudioSampleBuffer:sampleBuffer];
}
// some Recognition Delegate
@end
A Swift (3.0) implementation of @cube's answer:
import UIKit
import Speech
import AVFoundation
class ViewController: UIViewController {

    @IBOutlet weak var console: UITextView!

    var capture: AVCaptureSession?
    var speechRequest: SFSpeechAudioBufferRecognitionRequest?

    override func viewDidLoad() {
        super.viewDidLoad()
    }

    override func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        startRecognizer()
    }
    func startRecognizer() {
        SFSpeechRecognizer.requestAuthorization { (status) in
            switch status {
            case .authorized:
                let locale = NSLocale(localeIdentifier: "fr_FR")
                let sf = SFSpeechRecognizer(locale: locale as Locale)
                self.speechRequest = SFSpeechAudioBufferRecognitionRequest()
                sf?.recognitionTask(with: self.speechRequest!, delegate: self)
                // start capturing on the main queue, as in the Objective-C version
                DispatchQueue.main.async {
                    self.startCapture()
                }
            case .denied:
                fallthrough
            case .notDetermined:
                fallthrough
            case .restricted:
                print("User Authorization Issue.")
            }
        }
    }

    func endRecognizer() {
        endCapture()
        speechRequest?.endAudio()
    }
    func startCapture() {
        capture = AVCaptureSession()
        guard let audioDev = AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeAudio) else {
            print("Could not get capture device.")
            return
        }
        guard let audioIn = try? AVCaptureDeviceInput(device: audioDev) else {
            print("Could not create input device.")
            return
        }
        guard true == capture?.canAddInput(audioIn) else {
            print("Could not add input device")
            return
        }
        capture?.addInput(audioIn)
        let audioOut = AVCaptureAudioDataOutput()
        audioOut.setSampleBufferDelegate(self, queue: DispatchQueue.main)
        guard true == capture?.canAddOutput(audioOut) else {
            print("Could not add audio output")
            return
        }
        capture?.addOutput(audioOut)
        audioOut.connection(withMediaType: AVMediaTypeAudio)
        capture?.startRunning()
    }

    func endCapture() {
        if true == capture?.isRunning {
            capture?.stopRunning()
        }
    }
}
extension ViewController: AVCaptureAudioDataOutputSampleBufferDelegate {
    func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer: CMSampleBuffer!, from connection: AVCaptureConnection!) {
        speechRequest?.appendAudioSampleBuffer(sampleBuffer)
    }
}

extension ViewController: SFSpeechRecognitionTaskDelegate {
    func speechRecognitionTask(_ task: SFSpeechRecognitionTask, didFinishRecognition recognitionResult: SFSpeechRecognitionResult) {
        console.text = console.text + "\n" + recognitionResult.bestTranscription.formattedString
    }
}
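If you also want interim results while the user is still speaking, SFSpeechRecognitionTaskDelegate has an optional didHypothesizeTranscription callback (it fires only when shouldReportPartialResults is true, which is the default). A minimal sketch, assuming the same console outlet, added to the delegate extension above:

func speechRecognitionTask(_ task: SFSpeechRecognitionTask, didHypothesizeTranscription transcription: SFTranscription) {
    // interim hypothesis; the final result still arrives via didFinishRecognition
    console.text = console.text + "\n(partial) " + transcription.formattedString
}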
Don't forget to add the NSSpeechRecognitionUsageDescription value to your Info.plist file, or the app will crash.
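For reference, the corresponding Info.plist entries in source view (the description strings here are just examples; on iOS 10, NSMicrophoneUsageDescription is also required for microphone access):

<key>NSSpeechRecognitionUsageDescription</key>
<string>Your speech is sent to Apple to be transcribed.</string>
<key>NSMicrophoneUsageDescription</key>
<string>The microphone records your voice for speech recognition.</string>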
Apple's new native speech recognition does not detect end-of-speech silences automatically (a bug?). For your case this is actually useful, because speech recognition stays active for almost one minute (the maximum duration permitted by Apple's service). So basically, if you need continuous ASR, you have to restart speech recognition whenever this delegate fires:
func speechRecognitionTask(task: SFSpeechRecognitionTask, didFinishSuccessfully successfully: Bool) // whether successfully == true or not
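A minimal restart sketch in the same Swift 2-era style as the code below. One assumption on my part: a finished SFSpeechAudioBufferRecognitionRequest cannot be reused, so a fresh one is created before calling the startNativeRecording() shown further down:

func speechRecognitionTask(task: SFSpeechRecognitionTask, didFinishSuccessfully successfully: Bool) {
    // Restart regardless of the success flag to keep recognition continuous.
    nativeASRRequest = SFSpeechAudioBufferRecognitionRequest() // assumed property; fresh request per run
    do {
        try startNativeRecording()
    } catch {
        print("Could not restart recognition: \(error)")
    }
}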
Here is the recording / speech-recognition Swift code I use; it works perfectly. Ignore the part where I compute the average power of the microphone volume if you don't need it; I use it to animate a waveform. Don't forget to set the SFSpeechRecognitionTaskDelegate. If you need additional code, such as the delegate methods, let me know.
func startNativeRecording() throws {
    LEVEL_LOWPASS_TRIG = 0.01
    // Setup audio session
    node = audioEngine.inputNode!
    let recordingFormat = node!.outputFormatForBus(0)
    node!.installTapOnBus(0, bufferSize: 1024, format: recordingFormat) { (buffer, _) in
        self.nativeASRRequest.appendAudioPCMBuffer(buffer)
        // Code to animate a waveform with the microphone volume; ignore if you don't need it:
        let inNumberFrames: UInt32 = buffer.frameLength
        let samples: Float32 = buffer.floatChannelData[0][0] // https://github.com/apple/swift-evolution/blob/master/proposals/0107-unsaferawpointer.md
        var avgValue: Float32 = 0
        vDSP_maxmgv(buffer.floatChannelData[0], 1, &avgValue, vDSP_Length(inNumberFrames)) // Accelerate framework
        // vDSP_maxmgv returns peak values
        // vDSP_meamgv returns the mean magnitude of a vector
        let avg3: Float32 = (avgValue == 0) ? -100 : 20.0
        let averagePower = (self.LEVEL_LOWPASS_TRIG * avg3 * log10f(avgValue)) + ((1 - self.LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0)
        print("AVG. POWER: " + averagePower.description)
        dispatch_async(dispatch_get_main_queue(), { () -> Void in
            let fAvgPwr = CGFloat(averagePower)
            print("AvgPwr: " + fAvgPwr.description)
            var waveformFriendlyValue = 0.5 + fAvgPwr // -0.5 is the AvgPwr value when the user is silent
            if waveformFriendlyValue < 0 { waveformFriendlyValue = 0 } // clamp values < 0 to 0
            self.waveview.hidden = false
            self.waveview.updateWithLevel(waveformFriendlyValue)
        })
    }
    audioEngine.prepare()
    try audioEngine.start()
    isNativeASRBusy = true
    nativeASRTask = nativeSpeechRecognizer?.recognitionTaskWithRequest(nativeASRRequest, delegate: self)
    nativeSpeechRecognizer?.delegate = self
    // I use this timer to track no-speech timeouts; ignore if not needed:
    self.endOfSpeechTimeoutTimer = NSTimer.scheduledTimerWithTimeInterval(utteranceTimeoutSeconds, target: self, selector: #selector(ViewController.stopNativeRecording), userInfo: nil, repeats: false)
}