byted-ailab-speech-sdk

v4.0.10

Published

8 months ago

## 环境说明

当前 SDK 支持司内 / ToB 接入。以下代码中的 openspeech.bytedance.com 是 ToB 接入的公网服务域名；司内地址请咨询 oncall。

0High
0Medium
0Low

接入文档

环境说明

当前 SDK 支持司内 / ToB 接入。以下代码中的 openspeech.bytedance.com 是 ToB 接入的公网服务域名；司内地址请咨询 oncall。

接入前准备

出于安全原因，前端 SDK 接入时需要业务方服务端支持临时鉴权，并将临时鉴权 token 返回给前端 SDK 使用。服务端获取 token 的方式如下：

/**
 * Requests a temporary JWT from the speech STS endpoint.
 *
 * The surrounding documentation presents this as server-side code: the
 * integrator's backend calls it and hands the resulting token to the
 * front-end SDK.
 *
 * @param appid - Application id sent in the request body.
 * @param accessKey - Access key sent in the `Authorization` header.
 * @returns The `jwt_token` field of the JSON response (`undefined` when the
 *   response carries no such field).
 */
export async function getToken(appid: string, accessKey: string) {
  const endpoint = 'https://openspeech.bytedance.com/api/v1/sts/token';
  const payload = JSON.stringify({
    appid,
    duration: 300, // In seconds; the original note states the default is 1 hour.
  });

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      Authorization: `Bearer; ${accessKey}`,
      'Content-Type': 'application/json',
    },
    body: payload,
  });
  const data = await response.json();
  return data.jwt_token;
}

前端如何鉴权

由于浏览器端 WebSocket 不支持自定义 header，需要通过 query 参数进行鉴权。
不同服务所需的 auth 参数不同，可查看具体服务的 demo。

/**
 * Appends authentication parameters to a service URL as a query string.
 *
 * Keys and values are percent-encoded with `encodeURIComponent`.
 *
 * @param url - Base service URL; may already contain a query string.
 * @param auth - Query parameters to append.
 * @returns `url` unchanged when `auth` has no entries; otherwise `url`
 *   followed by the encoded parameters, joined with `?` or `&` as appropriate.
 */
export function buildFullUrl(url: string, auth: Record<string, string>) {
  const pairs = Object.entries(auth).map(
    ([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`,
  );

  // Nothing to append: avoid producing a dangling "?".
  if (pairs.length === 0) {
    return url;
  }

  // Pick the joiner so a URL that already has a query string stays valid.
  let separator = '?';
  if (url.includes('?')) {
    separator = url.endsWith('?') || url.endsWith('&') ? '' : '&';
  }

  return `${url}${separator}${pairs.join('&')}`;
}

// Example: pass the temporary token as the `api_jwt` query parameter.
const url = buildFullUrl('wss://openspeech.bytedance.com/api/v1/tts', { api_jwt: token });

语音识别

流式小模型/大模型

小模型和大模型使用同一个类进行调用，但支持的参数不同，具体参数请查看火山 API 文档。

import { useRef, useState } from 'react';
import { LabASR } from 'byted-ailab-speech-sdk';
import { getToken } from './helper';
import { buildFullUrl } from './helps';

/**
 * Demo component for streaming speech recognition.
 *
 * Renders start/stop buttons, a status header, the latest recognized text and
 * the raw response payload. The active code path configures the small-model
 * service; the large-model variant is kept below as commented-out reference.
 */
const ASRStory = () => {
  const [header, setHeader] = useState('');
  const [content, setContent] = useState('');
  const [fullResponse, setFullResponse] = useState({});
  // Set while a stop is in progress so stopRecord() is not invoked twice.
  const recordStopping = useRef(false);
  // Lazy initializer: LabASR() runs once on mount rather than on every render
  // (a non-lazy initial value is evaluated each render and then discarded).
  const [asrClient] = useState(() =>
    LabASR({
      onMessage: async (text, fullData) => {
        setContent(text);
        setFullResponse(fullData);
      },
      onStart() {
        setHeader('正在录音');
        setContent('');
      },
      onClose() {
        setHeader('连接关闭');
        // Custom socket-close handling goes here.
        // stopASR();
      },
      onError() {
        setHeader('连接异常');
        // Custom socket-error handling goes here.
        // stopASR();
      },
    }),
  );

  /** Fetches a temporary token, opens the ASR connection and starts recording. */
  const startASR = async () => {
    recordStopping.current = false;
    const appid = '';
    const accessKey = '';
    const auth: Record<string, string> = {};
    try {
      // Small model
      const token = await getToken(appid, accessKey);
      if (token) {
        auth.api_jwt = token;
      }
      const fullUrl = buildFullUrl('wss://openspeech.bytedance.com/api/v2/asr', auth);
      // Cluster name: ToB integrators find it in the console; internal integrators ask oncall.
      const cluster = '';
      const workflowPunctuation = 'audio_in,resample,partition,vad,fe,decode,nlu_punctuate';
      const params = {
        url: fullUrl,
        config: {
          app: {
            appid: appid,
            // NOTE(review): placeholder value; authentication here is carried by the
            // `api_jwt` query parameter — confirm against the service docs.
            token: 'access token',
            cluster: cluster,
          },
          user: {
            uid: 'xxx', // Defined by the integrator; helps with troubleshooting.
          },
          audio: {
            format: 'pcm',
            rate: 16000,
            bits: 16,
            channel: 1,
          },
          request: {
            // NOTE(review): `uuid` is not imported in this snippet — supply a
            // unique-id generator of your own.
            reqid: uuid(),
            workflow: workflowPunctuation,
            sequence: 1,
          },
        },
      };
      // Large model
      // const token = await getToken(appid, accessKey);
      // if (token) {
      //   auth.api_resource_id = 'volc.bigasr.sauc.duration';
      //   auth.api_app_key = appid;
      //   auth.api_access_key = `Jwt; ${token}`;
      // }

      // const fullUrl = buildFullUrl(`wss://openspeech.bytedance.com/api/v3/sauc/bigmodel`, auth);
      // const params = {
      //   url: fullUrl,
      //   config: {
      //     user: {
      //       uid: 'byted sdk demo',
      //     },
      //     audio: {
      //       format: 'pcm',
      //       rate: 16000,
      //       bits: 16,
      //       channel: 1,
      //     },
      //     request: {
      //       model_name: 'bigmodel',
      //       show_utterances: true,
      //     },
      //   },
      // };
      asrClient.connect(params);
      await asrClient.startRecord({}, async (recordResult: Blob) => {
        // Captured audio data; can be used for audio visualization if needed.
        // const rawData = await recordResult.arrayBuffer();
        // console.warn('rawData:', rawData);
      });
    } catch (err) {
      // Surface token/connection/recording failures instead of leaving an
      // unhandled promise rejection in the click handler.
      console.warn('err:', err);
      setHeader('连接异常');
    }
  };

  /** Stops recording; repeated calls while a stop is pending are ignored. */
  const stopASR = () => {
    // Already stopping...
    if (recordStopping.current) {
      return;
    }
    recordStopping.current = true;
    asrClient.stopRecord();
  };

  return (
    <div>
      <button id='start' onClick={startASR}>
        开始说话
      </button>
      <button id='stop' onClick={stopASR}>
        结束说话
      </button>
      <div id='text-header'>{header}</div>
      <div id='text-content'>{content}</div>
      <pre>{JSON.stringify(fullResponse, null, 2)}</pre>
    </div>
  );
};

语音合成

双向流式大模型

双向流式服务可以在合成过程中持续发送文本。
没有更多文本需要发送时，需要结束 session 并关闭连接。

import { useRef, useState } from 'react';
import { BidirectionalTTS } from 'byted-ailab-speech-sdk';
import { getToken } from './helper';
import { buildFullUrl } from './helps';
/**
 * Demo component for bidirectional streaming speech synthesis.
 *
 * Starts a synthesis session, lets the user keep sending text while audio is
 * produced, and accumulates the received audio bytes so they can be downloaded.
 */
const BidirectionalTTSStory = () => {
  const [audioUrl, setAudioUrl] = useState('');
  const [text, setText] = useState('今天天气怎么样？');
  // Accumulates every received audio chunk for the download button.
  const downloadCache = useRef(new Uint8Array(0));
  // Reset in onStart; not read anywhere in this snippet.
  const isServerError = useRef(false);
  // Lazy initializer: the client is created once on mount instead of calling
  // BidirectionalTTS() again on every render.
  const [ttsClient] = useState(() => BidirectionalTTS());

  /** Fetches a temporary token and opens a synthesis session. */
  const startTTS = async () => {
    setAudioUrl('');
    downloadCache.current = new Uint8Array(0);
    // Voice-cloning service
    // const speaker = 'xxx';  // see the console
    // const resourceId = 'volc.megatts.default';

    // Standard voice service
    const speaker = 'xxx';
    const resourceId = 'volc.service_type.10029';
    const appid = '';
    const accessKey = '';
    const auth: Record<string, string> = {};
    try {
      // ToB access authenticates through query parameters.
      const token = await getToken(appid, accessKey);
      if (token) {
        auth.api_resource_id = resourceId;
        auth.api_app_key = appid;
        auth.api_access_key = `Jwt; ${token}`;
      }

      const fullUrl = buildFullUrl('wss://openspeech.bytedance.com/api/v3/tts/bidirection', auth);
      // Named differently from the `audioUrl` state to avoid shadowing it.
      const streamUrl = ttsClient.start({
        debug: true,
        url: fullUrl,
        config: {
          user: {
            uid: 'bidirectional tts sdk DEMO', // Defined by the integrator.
          },
          namespace: 'BidirectionalTTS',
          req_params: {
            speaker,
            audio_params: {
              format: 'mp3',
              sample_rate: 24000,
            },
          },
        },
        onStart: () => {
          isServerError.current = false;
        },
        onMessage: (audioBuffer: ArrayBuffer) => {
          // Append the binary audio chunk to the download cache.
          const merged = new Uint8Array(downloadCache.current.byteLength + audioBuffer.byteLength);
          merged.set(downloadCache.current, 0);
          merged.set(new Uint8Array(audioBuffer), downloadCache.current.byteLength);
          downloadCache.current = merged;
        },

        onSessionStarted: () => {
          // sendText may only be called after onSessionStarted;
          // the first sentence can be sent here.
          sendText('测试文本。');
          // Once there is no more text, the session can be finished right away.
          // ttsClient.finishSession();
        },
        onError: err => {
          console.warn('err:', err);
        },
        onClose: () => {
          // Automatically download the audio once finished.
          // downloadAudio();
        },
        onTTSSentenceStart(val) {
          console.info('onTTSSentenceStart:', val);
        },
        onTTSSentenceEnd(val) {
          console.info('onTTSSentenceEnd:', val);
        },
      });
      setAudioUrl(streamUrl);
    } catch (err) {
      // Surface token/startup failures instead of leaving an unhandled
      // promise rejection in the click handler.
      console.warn('err:', err);
    }
  };

  /**
   * Sends more text into the running session. Delegates straight to the client;
   * routing through a ref that points back at this function would recurse forever.
   */
  function sendText(val: string) {
    ttsClient.sendText(val);
  }

  /** Saves the audio bytes received so far as a timestamped .mp3 file. */
  function downloadAudio() {
    const blob = new Blob([downloadCache.current]);
    const blobUrl = URL.createObjectURL(blob);
    const aTag = document.createElement('a');
    aTag.download = `${Date.now()}.mp3`;
    aTag.href = blobUrl;
    aTag.click();
    URL.revokeObjectURL(blobUrl);
  }

  return (
    <div>
      <audio src={audioUrl} controls />
      <div style={{ display: 'flex', gap: 10, marginTop: 10 }}>
        <button onClick={() => startTTS()}>开始合成</button>

        <button onClick={() => ttsClient.finishSession()}>finishSession</button>
        <button onClick={() => ttsClient.finishConnection()}>finishConnection</button>
        <button onClick={() => downloadAudio()}>下载音频</button>
      </div>
      <div style={{ marginTop: 10 }}>
        <input value={text} onChange={e => setText(e.target.value)} />
        <button onClick={() => sendText(text)}>持续合成</button>
      </div>
    </div>
  );
};

单向流式小模型/大模型

import { useRef, useState } from 'react';
import { LabTTS } from 'byted-ailab-speech-sdk';
import { getToken } from './helper';  // 接入方自定义
import { buildFullUrl } from './helps';
 /**
  * Demo component for one-way streaming speech synthesis.
  *
  * Submits a fixed sentence for synthesis, plays the result, shows the request
  * config that was sent, and lets the user download the received audio.
  */
 const TTSStory = () => {
  // Derive the option type from the SDK so the extracted config object is
  // checked exactly as it would be when written inline in start().
  type TTSStartOptions = Parameters<ReturnType<typeof LabTTS>['start']>[0];

  const [audioUrl, setAudioUrl] = useState('');
  // Config of the most recent request, rendered below. It lives in state
  // because the JSX needs a declared `config` binding to read from.
  const [config, setConfig] = useState<TTSStartOptions['config'] | undefined>(undefined);
  // Cache audio data if needed.
  const downloadCache = useRef(new Uint8Array(0));
  // Reset in onStart; not read anywhere in this snippet.
  const isServerError = useRef(false);

  /** Fetches a temporary token and submits the text for synthesis. */
  const startTTS = async () => {
    setAudioUrl('');
    downloadCache.current = new Uint8Array(0);
    const text_type = 'plain';
    const submitText = '你好啊，我是字节跳动的人工智能实验室语音合成技术负责人';
    const speaker = 'BV001_streaming';
    const auth: Record<string, string> = {};
    // ToB access authenticates through query parameters.
    const appid = '';
    const accessKey = '';
    const cluster = '';
    try {
      const token = await getToken(appid, accessKey);
      if (token) {
        auth.api_jwt = token;
      }
      const url = 'wss://openspeech.bytedance.com/api/v1/tts/ws_binary';
      const serviceUrl = buildFullUrl(url, auth);
      const requestConfig: TTSStartOptions['config'] = {
        app: {
          appid: appid,
          // NOTE(review): placeholder value; authentication here is carried by the
          // `api_jwt` query parameter — confirm against the service docs.
          token: 'access_token',
          cluster: cluster,
        },
        user: {
          uid: 'byted sdk DEMO', // Defined by the integrator.
        },
        audio: {
          encoding: 'mp3',
          rate: 24000,
          voice_type: speaker,
        },
        request: {
          // NOTE(review): `uuid` is not imported in this snippet — supply a
          // unique-id generator of your own.
          reqid: uuid(),
          text: submitText,
          text_type,
          operation: 'submit',
        },
      };
      setConfig(requestConfig);
      // Named differently from the `audioUrl` state to avoid shadowing it.
      const streamUrl = LabTTS().start({
        debug: true,
        url: serviceUrl,
        config: requestConfig,
        onStart: () => {
          isServerError.current = false;
        },
        onMessage: (audioBuffer: ArrayBuffer) => {
          // Append the binary audio chunk to the download cache.
          const merged = new Uint8Array(downloadCache.current.byteLength + audioBuffer.byteLength);
          merged.set(downloadCache.current, 0);
          merged.set(new Uint8Array(audioBuffer), downloadCache.current.byteLength);
          downloadCache.current = merged;
        },
        onError: err => {
          console.warn('err:', err);
        },
        onClose: () => {
          // downloadAudio();
        },
      });
      setAudioUrl(streamUrl);
    } catch (err) {
      // Surface token/startup failures instead of leaving an unhandled
      // promise rejection in the click handler.
      console.warn('err:', err);
    }
  };

  /** Saves the audio bytes received so far as test.mp3. */
  function downloadAudio() {
    const blob = new Blob([downloadCache.current]);
    const blobUrl = URL.createObjectURL(blob);
    const aTag = document.createElement('a');
    aTag.download = 'test.mp3';
    aTag.href = blobUrl;
    aTag.click();
    URL.revokeObjectURL(blobUrl);
  }

  return (
    <div>
      <pre>{JSON.stringify(config, null, 2)}</pre>
      <audio src={audioUrl} controls />
      <button onClick={() => startTTS()}>开始合成</button>
      <button onClick={() => downloadAudio()}>下载音频</button>
    </div>
  );
};

Published

Vulnerabilities

Links

Maintainers

Keywords

Readme