XREAL Air からマイク録音して音データを WAV フォーマットで OpenAI Whisper API に送って文字起こしするメモ

XREAL Air からマイク録音して音データを WAV フォーマットで OpenAI Whisper API に送って文字起こしするメモです。

Unity 単体で Whisper API 連携できているとシンプルにつながる

Unity からマイク録音して音データを WAV フォーマットで Whisper API に送って文字起こしするメモ

こちらの記事で PC の Unity からマイク録音して音データを WAV フォーマットで OpenAI Whisper API に送って文字起こしすることはできています。

NRSDK を Unity プロジェクトに入れて一通り準備しておく
- NRSDK Overview – NRSDK
XREAL Air は XREAL Light と違って Head Tracking が 3DoF だったりセンサー系に制限があるので意識しておく
- Compatibility – NRSDK

このあたりの XREAL Air で動かす準備をしておけば、上記の仕組みが動く Unity プロジェクトで作業を進めることができます。Cube をクリックして録音して、それを WAV フォーマットに変換して、OpenAI Whisper API に送って文字起こしする仕組みはそのまま動きます。

具体的には、

XREAL Air の録音のターゲットになるマイクデバイス名 Android audio input を狙う
Plugins/Android/AndroidManifest.xml で RECORD_AUDIO 権限を解放
- Manifest.permission | Android Developers

あたりの対応を加えれば、無事に動きました。

RECORD_AUDIO 権限を解放

Plugins/Android/AndroidManifest.xml で RECORD_AUDIO 権限を解放します。

実際の AndroidManifest.xml です。他の作業もしているので、余計な権限入ってるかもですが、ひとまず RECORD_AUDIO が加わってます。

<?xml version="1.0" encoding="utf-8"?>
<!-- Android manifest used by the Unity project (placed at Plugins/Android/AndroidManifest.xml). -->
<manifest xmlns:android="http://schemas.android.com/apk/res/android" package="com.unity3d.player" xmlns:tools="http://schemas.android.com/tools" android:installLocation="preferExternal">
  <uses-sdk tools:overrideLibrary="com.nreal.glasses_sdk" />
  <supports-screens android:smallScreens="true" android:normalScreens="true" android:largeScreens="true" android:xlargeScreens="true" android:anyDensity="true" />
  <application android:theme="@style/UnityThemeSelector" android:icon="@mipmap/app_icon" android:label="@string/app_name">
    <!-- Launcher activity: the standard Unity player activity. -->
    <activity android:name="com.unity3d.player.UnityPlayerActivity">
      <intent-filter>
        <action android:name="android.intent.action.MAIN" />
        <category android:name="android.intent.category.LAUNCHER" />
      </intent-filter>
    </activity>
    <!-- NRSDK-related metadata; the supported device list names both NrealLight and NrealAir. -->
    <meta-data android:name="nreal_sdk" android:value="true" />
    <meta-data android:name="com.nreal.supportDevices" android:value="NrealLight|NrealAir" />
  </application>
  <!-- NOTE(review): BLUETOOTH may be left over from other work and not needed for this sample; confirm. -->
  <uses-permission android:name="android.permission.BLUETOOTH" />
  <!-- Added for this sample: allows the app to record audio from the microphone. -->
  <uses-permission android:name="android.permission.RECORD_AUDIO" />
</manifest>

Unity で Cube を準備

Unity で以下のように Cube を準備します。ほか EventSystem やカメラに Physics Raycaster を仕込んでいる状態で、クリックすれば動作する状態です。

XREAL Air では EventSystem まわりを実装しておけば、スマホのコントローラーでポインタを当ててタップすると、クリック動作として受け取ってくれます。

プログラム

Cube に以下のように CubeEvent.cs を割り当てます。

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.EventSystems;
using UnityEngine.Networking;
using System;
using System.IO;
using System.Text;
 
/// <summary>
/// Click-to-record component: the first click on the object starts microphone
/// recording, the second click stops it, encodes the captured audio as a
/// 16-bit PCM WAV file in memory and posts it to the OpenAI Whisper
/// transcription API. The transcription response is written to the Unity log.
/// </summary>
public class CubeEvent: MonoBehaviour, IPointerClickHandler
{
    // True while a click-started recording is considered active.
    bool flagMicRecordStart = false;

    // True once Launch() has found one of the target microphone devices.
    bool catchedMicDevice = false;

    // Name of the microphone device used for recording.
    string currentRecordingMicDeviceName = "null";

    // Target microphone device name on PC.
    // This depends on the machine and must match an entry of Microphone.devices exactly.
    string recordingTargetMicDeviceName = "Krisp Microphone (Krisp Audio)";

    // Target microphone device name on XREAL Air: "Android audio input".
    string recordingTargetMicDeviceNameForXREALAir = "Android audio input";

    // Size in bytes of the WAV header written by EncodeWav (RIFF + fmt + data chunk headers).
    const int HeaderByteSize = 44;

    // Bits per PCM sample.
    const int BitsPerSample = 16;

    // WAV AudioFormat field (1 = PCM).
    const int AudioFormat = 1;

    // AudioClip the microphone records into.
    AudioClip recordedAudioClip;

    // Requested sampling frequency [Hz].
    int samplingFrequency = 44100;

    // Maximum recording length [sec].
    int maxTimeSeconds = 10;

    // Encoded WAV file of the most recent recording.
    byte[] dataWav;

    // OpenAI API key. Replace the placeholder locally; do not commit a real key to source control.
    string OpenAIAPIKey = "apiKey";

    void Start()
    {
        catchedMicDevice = false;

        Launch();
    }

    /// <summary>
    /// Scans Microphone.devices for the PC or XREAL Air target device and
    /// remembers the matching name in currentRecordingMicDeviceName.
    /// </summary>
    void Launch()
    {
        foreach (string device in Microphone.devices)
        {
            Debug.Log($"Mic device name : {device}");

            // PC microphone
            if (device == recordingTargetMicDeviceName)
            {
                Debug.Log($"{recordingTargetMicDeviceName} searched");

                currentRecordingMicDeviceName = device;
                catchedMicDevice = true;
            }

            // XREAL Air microphone
            if (device == recordingTargetMicDeviceNameForXREALAir)
            {
                Debug.Log($"{recordingTargetMicDeviceNameForXREALAir} serched");

                currentRecordingMicDeviceName = device;
                catchedMicDevice = true;
            }
        }

        if (catchedMicDevice)
        {
            Debug.Log($"マイク捜索成功");
            Debug.Log($"currentRecordingMicDeviceName : {currentRecordingMicDeviceName}");
        }
        else
        {
            Debug.Log($"マイク捜索失敗");
        }
    }

    /// <summary>
    /// Starts a non-looping microphone recording of at most maxTimeSeconds.
    /// </summary>
    void RecordStart()
    {
        recordedAudioClip = Microphone.Start(currentRecordingMicDeviceName, false, maxTimeSeconds, samplingFrequency);

        if (recordedAudioClip == null)
        {
            // Microphone.Start returns null when recording could not be started;
            // reset the toggle so the next click tries to start again.
            flagMicRecordStart = false;
            Debug.LogError($"Microphone.Start failed for device : {currentRecordingMicDeviceName}");
        }
    }

    /// <summary>
    /// Stops the microphone, converts the recorded part of the clip to WAV
    /// and starts the upload coroutine.
    /// </summary>
    void RecordStop()
    {
        // The write position must be read before Microphone.End; it tells how much
        // of the fixed-size clip was actually filled.
        bool stillRecording = Microphone.IsRecording(currentRecordingMicDeviceName);
        int micPosition = Microphone.GetPosition(currentRecordingMicDeviceName);

        Microphone.End(currentRecordingMicDeviceName);

        if (recordedAudioClip == null)
        {
            Debug.LogError("No recorded AudioClip available; nothing to send");
            return;
        }

        // If the recording already ended by itself (maximum length reached) the whole
        // clip is valid; otherwise only the samples up to the write position are.
        int recordedFrames = stillRecording
            ? Mathf.Min(micPosition, recordedAudioClip.samples)
            : recordedAudioClip.samples;

        if (recordedFrames <= 0)
        {
            Debug.LogError("Recording contains no samples; nothing to send");
            return;
        }

        Debug.Log($"WAV データ作成開始");

        float[] floatData = new float[recordedFrames * recordedAudioClip.channels];
        recordedAudioClip.GetData(floatData, 0);

        dataWav = EncodeWav(floatData, recordedAudioClip.channels, recordedAudioClip.frequency);

        Debug.Log($"WAV データ作成完了");
        Debug.Log($"dataWav.Length {dataWav.Length}");

        // For verification, the WAV can be written to disk here, e.g.
        // File.WriteAllBytes(Path.Combine(Application.dataPath, "record.wav"), dataWav);

        StartCoroutine(PostAPI());
    }

    /// <summary>
    /// Encodes interleaved float samples as a 16-bit PCM WAV file.
    /// </summary>
    /// <param name="samples">Interleaved samples; values outside [-1, 1] are clamped.</param>
    /// <param name="channels">Number of audio channels.</param>
    /// <param name="frequency">Sample rate in Hz.</param>
    /// <returns>The complete WAV file (44-byte header followed by sample data).</returns>
    static byte[] EncodeWav(float[] samples, int channels, int frequency)
    {
        int bytesPerSample = BitsPerSample / 8;
        int dataByteSize = samples.Length * bytesPerSample;

        using (MemoryStream stream = new MemoryStream(HeaderByteSize + dataByteSize))
        using (BinaryWriter writer = new BinaryWriter(stream))
        {
            // ChunkID
            writer.Write(Encoding.ASCII.GetBytes("RIFF"));
            // ChunkSize = file size minus the 8 bytes of ChunkID and ChunkSize themselves
            writer.Write((UInt32)(HeaderByteSize - 8 + dataByteSize));
            // Format
            writer.Write(Encoding.ASCII.GetBytes("WAVE"));

            // Subchunk1ID
            writer.Write(Encoding.ASCII.GetBytes("fmt "));
            // Subchunk1Size (16 for PCM)
            writer.Write((UInt32)16);
            // AudioFormat (PCM = 1)
            writer.Write((UInt16)AudioFormat);
            // NumChannels
            writer.Write((UInt16)channels);
            // SampleRate
            writer.Write((UInt32)frequency);
            // ByteRate (= SampleRate * NumChannels * BitsPerSample / 8)
            writer.Write((UInt32)(frequency * channels * bytesPerSample));
            // BlockAlign (= NumChannels * BitsPerSample / 8)
            writer.Write((UInt16)(channels * bytesPerSample));
            // BitsPerSample
            writer.Write((UInt16)BitsPerSample);

            // Subchunk2ID
            writer.Write(Encoding.ASCII.GetBytes("data"));
            // Subchunk2Size
            writer.Write((UInt32)dataByteSize);

            // Sample data: clamp first so out-of-range floats cannot wrap around
            // when converted to 16-bit integers.
            foreach (float sample in samples)
            {
                writer.Write((short)(Mathf.Clamp(sample, -1f, 1f) * short.MaxValue));
            }

            writer.Flush();
            return stream.ToArray();
        }
    }

    /// <summary>
    /// Toggles recording: starts on the first click and stops (and uploads) on the next.
    /// Clicks are ignored while no target microphone has been found.
    /// </summary>
    public void OnPointerClick(PointerEventData eventData)
    {
        if (!catchedMicDevice)
        {
            return;
        }

        if (flagMicRecordStart)
        {
            // Stop
            flagMicRecordStart = false;
            Debug.Log($"Mic Record Stop");

            RecordStop();
        }
        else
        {
            // Start
            flagMicRecordStart = true;
            Debug.Log($"Mic Record Start");

            RecordStart();
        }
    }

    /// <summary>
    /// Posts dataWav to the Whisper transcription endpoint as multipart/form-data
    /// and logs the response body or the error.
    /// </summary>
    IEnumerator PostAPI()
    {
        if (dataWav == null || dataWav.Length <= HeaderByteSize)
        {
            Debug.LogError("No WAV data to send");
            yield break;
        }

        // https://platform.openai.com/docs/api-reference/audio/createTranscription
        List<IMultipartFormSection> formData = new List<IMultipartFormSection>
        {
            // Use the Whisper model
            new MultipartFormDataSection("model", "whisper-1"),
            // Language of the audio (Japanese)
            new MultipartFormDataSection("language", "ja"),
            // The WAV file itself, labelled with its real media type
            new MultipartFormFileSection("file", dataWav, "whisper01.wav", "audio/wav"),
        };

        string urlWhisperAPI = "https://api.openai.com/v1/audio/transcriptions";

        // UnityWebRequest.Post with form sections sends multipart/form-data and already
        // attaches a DownloadHandlerBuffer. The using block releases the native request
        // resources when the coroutine finishes.
        using (UnityWebRequest request = UnityWebRequest.Post(urlWhisperAPI, formData))
        {
            // OpenAI authentication: "Bearer <API key>" in the Authorization header
            request.SetRequestHeader("Authorization", $"Bearer {OpenAIAPIKey}");

            Debug.Log("リクエスト開始");

            yield return request.SendWebRequest();

            switch (request.result)
            {
                case UnityWebRequest.Result.Success:
                    Debug.Log("リクエスト成功");
                    Debug.Log($"responseData: {request.downloadHandler.text}");
                    break;

                case UnityWebRequest.Result.ProtocolError:
                    Debug.LogError("ProtocolError");
                    Debug.LogError(request.responseCode);
                    Debug.LogError(request.error);
                    // The response body usually carries the API's error details.
                    Debug.LogError(request.downloadHandler.text);
                    break;

                case UnityWebRequest.Result.ConnectionError:
                    Debug.LogError("ConnectionError");
                    Debug.LogError(request.error);
                    break;

                case UnityWebRequest.Result.DataProcessingError:
                    Debug.LogError("DataProcessingError");
                    Debug.LogError(request.error);
                    break;

                default:
                    Debug.Log("リクエスト中");
                    break;
            }
        }
    }
}

実装として加えた部分です。

    // Target microphone device name used when recording on XREAL Air: "Android audio input"
    string recordingTargetMicDeviceNameForXREALAir = "Android audio input";

こちらでマイクデバイス名を用意して、

    // Looks through the microphones Unity reports and remembers the PC or
    // XREAL Air target device if one of them is present.
    void Launch()
    {
        string[] micNames = Microphone.devices;

        for (int i = 0; i < micNames.Length; i++)
        {
            string micName = micNames[i];
            Debug.Log($"Mic device name : {micName}");

            // PC microphone found
            if (micName == recordingTargetMicDeviceName)
            {
                Debug.Log($"{recordingTargetMicDeviceName} searched");
                currentRecordingMicDeviceName = micName;
                catchedMicDevice = true;
            }

            // XREAL Air microphone found
            if (micName == recordingTargetMicDeviceNameForXREALAir)
            {
                Debug.Log($"{recordingTargetMicDeviceNameForXREALAir} serched");
                currentRecordingMicDeviceName = micName;
                catchedMicDevice = true;
            }
        }

        // Report the outcome of the search
        if (!catchedMicDevice)
        {
            Debug.Log($"マイク捜索失敗");
            return;
        }

        Debug.Log($"マイク捜索成功");
        Debug.Log($"currentRecordingMicDeviceName : {currentRecordingMicDeviceName}");
    }

Launch の部分でマイク捜索していて、デバッグしやすいように PC でのデバイスが見つかった処理と XREAL Air のマイクデバイス Android audio input が見つかった処理を並列に書いています。これで PC でも XREAL Air でも両方でマイクの検証が行えます。

        // Request Japanese: send "ja" as the language form field
        formData.Add(new MultipartFormDataSection("language", "ja"));

Whisper API の部分は、language パラメータに ja を指定して、日本語で文字起こしした結果を返してもらうようにしています。

動かしてみる

というわけで XREAL Air で動かした様子です。Debug.Log をログで出す部分は追加実装してます。

実際に Cube をクリックして録音して、それを WAV フォーマットに変換して、OpenAI Whisper API に送って文字起こしすると、このように日本語で返答を返してくれます。

さらに実装した様子

できたー！XREAL Air のマイクで音声録音したデータを OpenAI API の Whisper API に送って日本語で文字起こしして受け取れました～。これでいろいろな仕組みにつなげられるます！ #AR #XREAL pic.twitter.com/Dr5avY1z7b

— Tanaka Seigo (@1ft_seabass) September 12, 2023