I am trying to perform continuous speech recognition using AVCapture on the iOS 10 beta. I have set up captureOutput(...) to continuously get CMSampleBuffers, and I put these buffers directly into the SFSpeechAudioBufferRecognitionRequest that I set up previously like this:
// ... do some setup
SFSpeechRecognizer.requestAuthorization { authStatus in
    if authStatus == SFSpeechRecognizerAuthorizationStatus.authorized {
        self.m_recognizer = SFSpeechRecognizer()
        self.m_recognRequest = SFSpeechAudioBufferRecognitionRequest()
        self.m_recognRequest?.shouldReportPartialResults = false
        self.m_isRecording = true
    } else {
        print("not authorized")
    }
}
// ... do further setup
func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer: CMSampleBuffer!, from connection: AVCaptureConnection!) {
    if !m_AV_initialized {
        print("captureOutput(...): not initialized !")
        return
    }
    if !m_isRecording {
        return
    }
    let formatDesc = CMSampleBufferGetFormatDescription(sampleBuffer)
    let mediaType = CMFormatDescriptionGetMediaType(formatDesc!)
    if mediaType == kCMMediaType_Audio {
        // process audio here
        m_recognRequest?.appendAudioSampleBuffer(sampleBuffer)
    }
}
Everything works for a few seconds. Then captureOutput is no longer called. If I comment out the line appendAudioSampleBuffer(sampleBuffer), captureOutput is called for as long as the app runs (as expected). Obviously, putting the sample buffers into the speech recognition engine somehow blocks further execution. My guess is that the available buffers are consumed after some time, and the process somehow halts because it can't get buffers anymore?
I should mention that everything recorded during the first two seconds leads to correct recognitions. Since Apple has not put any text into the beta docs: how is SFSpeechAudioBufferRecognitionRequest.endAudio() supposed to be used?
Does anybody know something here?
Thanks, Chris
I converted the SpeakToMe sample Swift code from the speech recognition WWDC developer talk to Objective-C, and it worked for me. For the Swift version, see https://developer.apple.com/videos/play/wwdc2016/509/; the Objective-C version is below.
- (void)viewDidAppear:(BOOL)animated {
    _recognizer = [[SFSpeechRecognizer alloc] initWithLocale:[NSLocale localeWithLocaleIdentifier:@"en-US"]];
    [_recognizer setDelegate:self];
    [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus authStatus) {
        switch (authStatus) {
            case SFSpeechRecognizerAuthorizationStatusAuthorized:
                // User gave access to speech recognition
                NSLog(@"Authorized");
                break;
            case SFSpeechRecognizerAuthorizationStatusDenied:
                // User denied access to speech recognition
                NSLog(@"SFSpeechRecognizerAuthorizationStatusDenied");
                break;
            case SFSpeechRecognizerAuthorizationStatusRestricted:
                // Speech recognition restricted on this device
                NSLog(@"SFSpeechRecognizerAuthorizationStatusRestricted");
                break;
            case SFSpeechRecognizerAuthorizationStatusNotDetermined:
                // Speech recognition not yet authorized
                break;
            default:
                NSLog(@"Default");
                break;
        }
    }];
    audioEngine = [[AVAudioEngine alloc] init];
    _speechSynthesizer = [[AVSpeechSynthesizer alloc] init];
    [_speechSynthesizer setDelegate:self];
}
- (void)startRecording
{
    [self clearLogs:nil];
    NSError *outError;
    AVAudioSession *audioSession = [AVAudioSession sharedInstance];
    [audioSession setCategory:AVAudioSessionCategoryRecord error:&outError];
    [audioSession setMode:AVAudioSessionModeMeasurement error:&outError];
    [audioSession setActive:true withOptions:AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation error:&outError];
    request2 = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
    inputNode = [audioEngine inputNode];
    if (request2 == nil) {
        NSLog(@"Unable to create a SFSpeechAudioBufferRecognitionRequest object");
    }
    if (inputNode == nil) {
        NSLog(@"Unable to create an inputNode object");
    }
    request2.shouldReportPartialResults = true;
    _currentTask = [_recognizer recognitionTaskWithRequest:request2 delegate:self];
    [inputNode installTapOnBus:0 bufferSize:4096 format:[inputNode outputFormatForBus:0] block:^(AVAudioPCMBuffer *buffer, AVAudioTime *when) {
        NSLog(@"Block tap!");
        [request2 appendAudioPCMBuffer:buffer];
    }];
    [audioEngine prepare];
    [audioEngine startAndReturnError:&outError];
    NSLog(@"Error %@", outError);
}
- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition:(SFSpeechRecognitionResult *)result {
    NSLog(@"speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition");
    NSString *translatedString = [[[result bestTranscription] formattedString] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
    [self log:translatedString];
    if ([result isFinal]) {
        [audioEngine stop];
        [inputNode removeTapOnBus:0];
        _currentTask = nil;
        request2 = nil;
    }
}
I succeeded in using SFSpeechRecognizer continuously. The main point is to use an AVCaptureSession to capture audio and transfer it to the SpeechRecognizer. Sorry, my Swift is poor, so here is the ObjC version only.
Here is my sample code (some UI code is left out; the important parts are marked):
@interface ViewController ()<AVCaptureAudioDataOutputSampleBufferDelegate,SFSpeechRecognitionTaskDelegate>
@property (nonatomic, strong) AVCaptureSession *capture;
@property (nonatomic, strong) SFSpeechAudioBufferRecognitionRequest *speechRequest;
@end
@implementation ViewController
- (void)startRecognizer
{
    [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus status) {
        if (status == SFSpeechRecognizerAuthorizationStatusAuthorized) {
            NSLocale *local = [[NSLocale alloc] initWithLocaleIdentifier:@"fr_FR"];
            SFSpeechRecognizer *sf = [[SFSpeechRecognizer alloc] initWithLocale:local];
            self.speechRequest = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
            [sf recognitionTaskWithRequest:self.speechRequest delegate:self];
            // should call startCapture method in main queue or it may crash
            dispatch_async(dispatch_get_main_queue(), ^{
                [self startCapture];
            });
        }
    }];
}

- (void)endRecognizer
{
    // END capture and END voice recognition,
    // or Apple will terminate this task after 30000 ms.
    [self endCapture];
    [self.speechRequest endAudio];
}
- (void)startCapture
{
    NSError *error;
    self.capture = [[AVCaptureSession alloc] init];
    AVCaptureDevice *audioDev = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeAudio];
    if (audioDev == nil) {
        NSLog(@"Couldn't create audio capture device");
        return;
    }
    // create mic device
    AVCaptureDeviceInput *audioIn = [AVCaptureDeviceInput deviceInputWithDevice:audioDev error:&error];
    if (error != nil) {
        NSLog(@"Couldn't create audio input");
        return;
    }
    // add mic device to the capture object
    if ([self.capture canAddInput:audioIn] == NO) {
        NSLog(@"Couldn't add audio input");
        return;
    }
    [self.capture addInput:audioIn];
    // export audio data
    AVCaptureAudioDataOutput *audioOutput = [[AVCaptureAudioDataOutput alloc] init];
    [audioOutput setSampleBufferDelegate:self queue:dispatch_get_main_queue()];
    if ([self.capture canAddOutput:audioOutput] == NO) {
        NSLog(@"Couldn't add audio output");
        return;
    }
    [self.capture addOutput:audioOutput];
    [audioOutput connectionWithMediaType:AVMediaTypeAudio];
    [self.capture startRunning];
}

- (void)endCapture
{
    if (self.capture != nil && [self.capture isRunning]) {
        [self.capture stopRunning];
    }
}

- (void)captureOutput:(AVCaptureOutput *)captureOutput didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer fromConnection:(AVCaptureConnection *)connection
{
    // forward every captured audio sample buffer to the recognition request
    [self.speechRequest appendAudioSampleBuffer:sampleBuffer];
}
// some Recognition Delegate
@end
A Swift (3.0) implementation of @cube's answer:
import UIKit
import Speech
import AVFoundation
class ViewController: UIViewController {

    @IBOutlet weak var console: UITextView!

    var capture: AVCaptureSession?
    var speechRequest: SFSpeechAudioBufferRecognitionRequest?

    override func viewDidLoad() {
        super.viewDidLoad()
    }

    override func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        startRecognizer()
    }
    func startRecognizer() {
        SFSpeechRecognizer.requestAuthorization { (status) in
            switch status {
            case .authorized:
                let locale = NSLocale(localeIdentifier: "fr_FR")
                let sf = SFSpeechRecognizer(locale: locale as Locale)
                self.speechRequest = SFSpeechAudioBufferRecognitionRequest()
                sf?.recognitionTask(with: self.speechRequest!, delegate: self)
                // start capturing on the main queue, as in the Objective-C version
                DispatchQueue.main.async {
                    self.startCapture()
                }
            case .denied:
                fallthrough
            case .notDetermined:
                fallthrough
            case .restricted:
                print("User Authorization Issue.")
            }
        }
    }

    func endRecognizer() {
        endCapture()
        speechRequest?.endAudio()
    }
    func startCapture() {
        capture = AVCaptureSession()
        guard let audioDev = AVCaptureDevice.defaultDevice(withMediaType: AVMediaTypeAudio) else {
            print("Could not get capture device.")
            return
        }
        guard let audioIn = try? AVCaptureDeviceInput(device: audioDev) else {
            print("Could not create input device.")
            return
        }
        guard true == capture?.canAddInput(audioIn) else {
            print("Could not add input device")
            return
        }
        capture?.addInput(audioIn)
        let audioOut = AVCaptureAudioDataOutput()
        audioOut.setSampleBufferDelegate(self, queue: DispatchQueue.main)
        guard true == capture?.canAddOutput(audioOut) else {
            print("Could not add audio output")
            return
        }
        capture?.addOutput(audioOut)
        audioOut.connection(withMediaType: AVMediaTypeAudio)
        capture?.startRunning()
    }

    func endCapture() {
        if true == capture?.isRunning {
            capture?.stopRunning()
        }
    }
}
extension ViewController: AVCaptureAudioDataOutputSampleBufferDelegate {
    func captureOutput(_ captureOutput: AVCaptureOutput!, didOutputSampleBuffer sampleBuffer: CMSampleBuffer!, from connection: AVCaptureConnection!) {
        speechRequest?.appendAudioSampleBuffer(sampleBuffer)
    }
}

extension ViewController: SFSpeechRecognitionTaskDelegate {
    func speechRecognitionTask(_ task: SFSpeechRecognitionTask, didFinishRecognition recognitionResult: SFSpeechRecognitionResult) {
        console.text = console.text + "\n" + recognitionResult.bestTranscription.formattedString
    }
}
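If you also want interim results while the user is still speaking, SFSpeechRecognitionTaskDelegate has an optional didHypothesizeTranscription callback (it fires only when shouldReportPartialResults is true, which is the default). A minimal sketch, assuming the same console outlet, added to the delegate extension above:

func speechRecognitionTask(_ task: SFSpeechRecognitionTask, didHypothesizeTranscription transcription: SFTranscription) {
    // interim hypothesis; the final result still arrives via didFinishRecognition
    console.text = console.text + "\n(partial) " + transcription.formattedString
}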
Don't forget to add the NSSpeechRecognitionUsageDescription value to your Info.plist file, or the app will crash.
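For reference, the corresponding Info.plist entries in source view (the description strings here are just examples; on iOS 10, NSMicrophoneUsageDescription is also required for microphone access):

<key>NSSpeechRecognitionUsageDescription</key>
<string>Your speech is sent to Apple to be transcribed.</string>
<key>NSMicrophoneUsageDescription</key>
<string>The microphone records your voice for speech recognition.</string>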
Apple's new native speech recognition does not detect end-of-speech silences automatically (a bug?). For your case this is actually useful, because speech recognition stays active for almost one minute (the maximum duration permitted by Apple's service). So basically, if you need continuous ASR, you have to restart speech recognition whenever this delegate fires:
func speechRecognitionTask(task: SFSpeechRecognitionTask, didFinishSuccessfully successfully: Bool) // whether successfully == true or not
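A minimal restart sketch in the same Swift 2-era style as the code below. One assumption on my part: a finished SFSpeechAudioBufferRecognitionRequest cannot be reused, so a fresh one is created before calling the startNativeRecording() shown further down:

func speechRecognitionTask(task: SFSpeechRecognitionTask, didFinishSuccessfully successfully: Bool) {
    // Restart regardless of the success flag to keep recognition continuous.
    nativeASRRequest = SFSpeechAudioBufferRecognitionRequest() // assumed property; fresh request per run
    do {
        try startNativeRecording()
    } catch {
        print("Could not restart recognition: \(error)")
    }
}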
Here is the recording / speech-recognition Swift code I use; it works perfectly. Ignore the part where I compute the average power of the microphone volume if you don't need it; I use it to animate a waveform. Don't forget to set the SFSpeechRecognitionTaskDelegate. If you need additional code, such as the delegate methods, let me know.
func startNativeRecording() throws {
    LEVEL_LOWPASS_TRIG = 0.01
    // Setup audio session
    node = audioEngine.inputNode!
    let recordingFormat = node!.outputFormatForBus(0)
    node!.installTapOnBus(0, bufferSize: 1024, format: recordingFormat) { (buffer, _) in
        self.nativeASRRequest.appendAudioPCMBuffer(buffer)
        // Code to animate a waveform with the microphone volume; ignore if you don't need it:
        let inNumberFrames: UInt32 = buffer.frameLength
        let samples: Float32 = buffer.floatChannelData[0][0] // https://github.com/apple/swift-evolution/blob/master/proposals/0107-unsaferawpointer.md
        var avgValue: Float32 = 0
        vDSP_maxmgv(buffer.floatChannelData[0], 1, &avgValue, vDSP_Length(inNumberFrames)) // Accelerate framework
        // vDSP_maxmgv returns peak values
        // vDSP_meamgv returns the mean magnitude of a vector
        let avg3: Float32 = (avgValue == 0) ? -100 : 20.0
        let averagePower = (self.LEVEL_LOWPASS_TRIG * avg3 * log10f(avgValue)) + ((1 - self.LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0)
        print("AVG. POWER: " + averagePower.description)
        dispatch_async(dispatch_get_main_queue(), { () -> Void in
            let fAvgPwr = CGFloat(averagePower)
            print("AvgPwr: " + fAvgPwr.description)
            var waveformFriendlyValue = 0.5 + fAvgPwr // -0.5 is the AvgPwr value when the user is silent
            if waveformFriendlyValue < 0 { waveformFriendlyValue = 0 } // clamp values < 0 to 0
            self.waveview.hidden = false
            self.waveview.updateWithLevel(waveformFriendlyValue)
        })
    }
    audioEngine.prepare()
    try audioEngine.start()
    isNativeASRBusy = true
    nativeASRTask = nativeSpeechRecognizer?.recognitionTaskWithRequest(nativeASRRequest, delegate: self)
    nativeSpeechRecognizer?.delegate = self
    // I use this timer to track no-speech timeouts; ignore if not needed:
    self.endOfSpeechTimeoutTimer = NSTimer.scheduledTimerWithTimeInterval(utteranceTimeoutSeconds, target: self, selector: #selector(ViewController.stopNativeRecording), userInfo: nil, repeats: false)
}