//
//  ViewController.m
//  whisper.objc
//
//  Created by Georgi Gerganov on 23.10.22.
//

#import "ViewController.h"
#import "whisper.h"

#define NUM_BYTES_PER_BUFFER 16*1024
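
// NOTE: StateInp, NUM_BUFFERS, MAX_AUDIO_SEC and SAMPLE_RATE are assumed to be
// defined in ViewController.h, which is also expected to import
// <AudioToolbox/AudioQueue.h> for the AudioQueue types used below.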

// callback used to process captured audio
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs);

@interface ViewController ()

@property (weak, nonatomic) IBOutlet UILabel    *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton   *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton   *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton   *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;

@end

@implementation ViewController
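
// whisper.cpp expects 16 kHz, mono, signed 16-bit PCM input, so the capture
// format is pinned to WHISPER_SAMPLE_RATE with one 2-byte sample per frame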
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
    format->mSampleRate       = WHISPER_SAMPLE_RATE;
    format->mFormatID         = kAudioFormatLinearPCM;
    format->mFramesPerPacket  = 1;
    format->mChannelsPerFrame = 1;
    format->mBytesPerFrame    = 2;
    format->mBytesPerPacket   = 2;
    format->mBitsPerChannel   = 16;
    format->mReserved         = 0;
    format->mFormatFlags      = kLinearPCMFormatFlagIsSignedInteger;
}

- (void)viewDidLoad {
    [super viewDidLoad];

    // whisper.cpp initialization
    {
        // load the model
        NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"];

        // check if the model exists
        if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) {
            NSLog(@"Model file not found");
            return;
        }

        NSLog(@"Loading model from %@", modelPath);

        // create ggml context
        struct whisper_context_params cparams = whisper_context_default_params();
#if TARGET_OS_SIMULATOR
        cparams.use_gpu = false;
        NSLog(@"Running on simulator, using CPU");
#endif
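        // loading the weights is the expensive one-time step; everything else
        // reuses the returned whisper_context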
        stateInp.ctx = whisper_init_from_file_with_params([modelPath UTF8String], cparams);

        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
            NSLog(@"Failed to load model");
            return;
        }
    }
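
    // the buffers below hold up to MAX_AUDIO_SEC seconds of audio: the queue
    // callback appends raw 16-bit samples, and onTranscribe: converts them
    // into the float buffer consumed by whisper_full()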
    // initialize audio format and buffers
    {
        [self setupAudioFormat:&stateInp.dataFormat];

        stateInp.n_samples = 0;
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
    }

    stateInp.isTranscribing = false;
    stateInp.isRealtime     = false;
}
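
// stop the audio queue, release its buffers and put the UI back to idle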
-(IBAction) stopCapturing {
    NSLog(@"Stop capturing");

    _labelStatusInp.text = @"Status: Idle";

    [_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal];
    [_buttonToggleCapture setBackgroundColor:[UIColor grayColor]];

    stateInp.isCapturing = false;

    AudioQueueStop(stateInp.queue, true);
    for (int i = 0; i < NUM_BUFFERS; i++) {
        AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[i]);
    }

    AudioQueueDispose(stateInp.queue, true);
}
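
// start/stop toggle: sets up a new input AudioQueue with NUM_BUFFERS buffers
// that AudioInputCallback recycles as they fill up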
- (IBAction)toggleCapture:(id)sender {
    if (stateInp.isCapturing) {
        // stop capturing
        [self stopCapturing];

        return;
    }

    // initiate audio capturing
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
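    // keep an unretained reference to self so the C audio callback can reach
    // back into this controller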
    stateInp.vc = (__bridge void *)(self);

    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
                                         &stateInp,
                                         CFRunLoopGetCurrent(),
                                         kCFRunLoopCommonModes,
                                         0,
                                         &stateInp.queue);

    if (status == 0) {
        for (int i = 0; i < NUM_BUFFERS; i++) {
            AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]);
            AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL);
        }

        stateInp.isCapturing = true;
        status = AudioQueueStart(stateInp.queue, NULL);
        if (status == 0) {
            _labelStatusInp.text = @"Status: Capturing";
            [sender setTitle:@"Stop Capturing" forState:UIControlStateNormal];
            [_buttonToggleCapture setBackgroundColor:[UIColor redColor]];
        }
    }

    if (status != 0) {
        [self stopCapturing];
    }
}
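
// invoked before a transcription is requested from the UI: leaves realtime
// mode and stops the capture so the buffered audio can be processed as a whole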
- (IBAction)onTranscribePrepare:(id)sender {
    _textviewResult.text = @"Processing - please wait ...";

    if (stateInp.isRealtime) {
        [self onRealtime:(id)sender];
    }

    if (stateInp.isCapturing) {
        [self stopCapturing];
    }
}
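
// toggle realtime mode: while enabled, AudioInputCallback triggers
// onTranscribe: after every captured buffer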
- (IBAction)onRealtime:(id)sender {
    stateInp.isRealtime = !stateInp.isRealtime;

    if (stateInp.isRealtime) {
        [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
    } else {
        [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
    }

    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
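
// convert the captured samples to float and run the whisper model on a
// background queue, then publish the transcription on the main thread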
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) {
        return;
    }

    NSLog(@"Processing %d samples", stateInp.n_samples);

    stateInp.isTranscribing = true;

    // dispatch the model to a background thread
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        // process captured audio
        // convert I16 to F32
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }

        // run the model
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

        // get maximum number of threads on this device (max 8)
        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = false;
        params.language         = "en";
        params.n_threads        = max_threads;
        params.offset_ms        = 0;
        params.no_context       = true;
        params.single_segment   = self->stateInp.isRealtime;
        params.no_timestamps    = params.single_segment;
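
        // measure wall-clock time of the full inference pass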
        CFTimeInterval startTime = CACurrentMediaTime();

        whisper_reset_timings(self->stateInp.ctx);

        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
            NSLog(@"Failed to run the model");
            // report the failure on the main thread - UIKit must not be
            // touched from this background queue
            dispatch_async(dispatch_get_main_queue(), ^{
                self->_textviewResult.text = @"Failed to run the model";
                self->stateInp.isTranscribing = false;
            });
            return;
        }

        whisper_print_timings(self->stateInp.ctx);

        CFTimeInterval endTime = CACurrentMediaTime();

        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);

        // result text
        NSString *result = @"";

        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
        for (int i = 0; i < n_segments; i++) {
            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);

            // append the text to the result
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
        }

        const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;

        // append processing time
        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]];
        result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];

        // dispatch the result to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text = result;
            self->stateInp.isTranscribing = false;
        });
    });
}

//
// Callback implementation
//

void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs)
{
    StateInp * stateInp = (StateInp*)inUserData;

    if (!stateInp->isCapturing) {
        NSLog(@"Not capturing, ignoring audio");
        return;
    }
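
    // 2 bytes per sample: the queue delivers signed 16-bit mono PCM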
    const int n = inBuffer->mAudioDataByteSize / 2;

    NSLog(@"Captured %d new samples", n);

    if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");

        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc stopCapturing];
        });

        return;
    }

    for (int i = 0; i < n; i++) {
        stateInp->audioBufferI16[stateInp->n_samples + i] = ((short*)inBuffer->mAudioData)[i];
    }

    stateInp->n_samples += n;

    // put the buffer back in the queue
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);

    if (stateInp->isRealtime) {
        // dispatch onTranscribe() to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc onTranscribe:nil];
        });
    }
}

@end