【AI】二、Spring AI 调用微软云 Azure 的 gpt-4o-mini-tts TTS 模型（文字转音频）

本文介绍了微软Azure语音服务的部署与使用方法。首先展示语音服务配置界面，包括文本转语音和语音转文本的API端点、区域及密钥等参数。文中提供了Windows系统的cURL请求模板和调试可视化界面操作方法。重点介绍了Java代码实现文本转语音功能，包括接口参数配置、SSML语音参数设置、OkHttp请求构建及响应处理流程。代码示例展示了如何通过Azure语音服务API将文本转换为音频文件并保存为MP3文件。

qq_24923619

354人浏览 · 2025-08-20 09:40:21

qq_24923619 · 2025-08-20 09:40:21 发布

一、先部署语音服务

在这里插入图片描述

二、需要的配置参数

端点：
文本转语音 https://eastus2.tts.speech.microsoft.com
语音转文本 https://eastus2.stt.speech.microsoft.com
区域 eastus2
密钥： 1IjIIlFN9jT3U0auo46n7Xyu9NGJ6Soxxxxxxxxxxxxxxxx
接口说明文档
https://learn.microsoft.com/zh-cn/azure/ai-services/speech-service/get-started-text-to-speech?tabs=windows&pivots=programming-language-rest

在这里插入图片描述

Windows 官方请求模板

REM Sends an SSML document to the text-to-speech endpoint and saves the reply as output.mp3.
REM The region and subscription key are read from the SPEECH_REGION and SPEECH_KEY
REM environment variables; X-Microsoft-OutputFormat selects the audio format of the response.
curl --location --request POST "https://%SPEECH_REGION%.tts.speech.microsoft.com/cognitiveservices/v1" ^
--header "Ocp-Apim-Subscription-Key: %SPEECH_KEY%" ^
--header "Content-Type: application/ssml+xml" ^
--header "X-Microsoft-OutputFormat: audio-16khz-128kbitrate-mono-mp3" ^
--header "User-Agent: curl" ^
--data-raw "<speak version='1.0' xml:lang='en-US'><voice xml:lang='en-US' xml:gender='Female' name='en-US-AvaMultilingualNeural'>my voice is my passport verify me</voice></speak>" --output output.mp3

三、调试文字转音频（可视化）

点击【在广场中打开】可以调试文字转音频

在这里插入图片描述

四、java代码

package com.xiaozhi.tts;

import okhttp3.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.concurrent.TimeUnit;

/**
 * Command-line demo that sends an SSML document to the Azure Speech text-to-speech
 * REST endpoint and writes the returned audio stream to a local file.
 */
public class AzureTTS_Test {
    private static final Logger logger = LoggerFactory.getLogger(AzureTTS_Test.class);

    /** Single HTTP client reused by every call so its connection pool is shared. */
    private static final OkHttpClient HTTP_CLIENT = new OkHttpClient.Builder()
            .connectTimeout(30, TimeUnit.SECONDS)
            .readTimeout(30, TimeUnit.SECONDS)
            .writeTimeout(30, TimeUnit.SECONDS)
            .build();

    /**
     * Builds the request parameters and the SSML body, then calls the service.
     *
     * <p>The subscription key is read from the {@code SPEECH_KEY} environment variable and the
     * region from {@code SPEECH_REGION} (falling back to {@code eastus2}), so that no credential
     * is stored in source code. The process exits with status 1 when the conversion fails.
     *
     * @param args unused
     * @throws Exception declared by {@link #callTextToSpeechApi}
     */
    public static void main(String[] args) throws Exception {

        // ============================ endpoint parameters ============================
        // Region of the Speech resource.
        String speechRegion = System.getenv("SPEECH_REGION");
        if (speechRegion == null || speechRegion.trim().isEmpty()) {
            speechRegion = "eastus2";
        }
        // Subscription key; must come from the environment, never from source code.
        String speechKey = System.getenv("SPEECH_KEY");
        if (speechKey == null || speechKey.trim().isEmpty()) {
            System.err.println("未设置环境变量 SPEECH_KEY");
            System.exit(1);
        }
        // Request URL.
        String apiUrl = "https://" + speechRegion + ".tts.speech.microsoft.com/cognitiveservices/v1";

        // Output file path (including file name).
        String outputFilePath = "output.mp3";
        // Audio format requested from the service.
        String outputFormat = "audio-16khz-128kbitrate-mono-mp3";

        // ==================== SSML parameters (voice settings) ====================
        String xmlLang = "en-US";    // language
        String voiceGender = "Female"; // voice gender
        String voiceName = "en-US-AvaMultilingualNeural"; // concrete voice name
        String textToSpeak = "需要转音频的文本";  // text to convert to audio

        // ==================== other parameters ====================
        String userAgent = "curl"; // value sent in the User-Agent header

        // Every interpolated value is XML-escaped so that characters such as '&', '<' or '\''
        // in the text cannot break the SSML document.
        String ssmlRequestBody = "<speak version='1.0' xml:lang='" + escapeXml(xmlLang) + "'>"
                + "<voice xml:lang='" + escapeXml(xmlLang) + "' xml:gender='" + escapeXml(voiceGender)
                + "' name='" + escapeXml(voiceName) + "'>" + escapeXml(textToSpeak) + "</voice></speak>";

        boolean saved = callTextToSpeechApi(apiUrl, speechKey, outputFormat, userAgent, ssmlRequestBody, outputFilePath);
        if (!saved) {
            System.exit(1);
        }
    }

    /**
     * Calls the text-to-speech API and saves the returned audio to a file.
     *
     * @param apiUrl endpoint URL
     * @param speechKey subscription key sent in the {@code Ocp-Apim-Subscription-Key} header
     * @param outputFormat audio format, e.g. {@code audio-16khz-128kbitrate-mono-mp3}
     * @param userAgent value of the {@code User-Agent} header
     * @param ssmlRequestBody SSML document used as the request body
     * @param outputFilePath output file path (including file name)
     * @return {@code true} when the audio was received and written completely; {@code false} on
     *         a non-2xx response, an empty body, or any I/O or request failure
     * @throws Exception kept in the signature for compatibility; failures are reported through
     *         the return value and the log
     */
    private static boolean callTextToSpeechApi(String apiUrl, String speechKey, String outputFormat, String userAgent, String ssmlRequestBody, String outputFilePath) throws Exception {
        try {
            RequestBody requestBody = RequestBody.create(ssmlRequestBody, MediaType.parse("application/ssml+xml"));

            Request request = new Request.Builder()
                    .url(apiUrl)
                    .post(requestBody)
                    .addHeader("Ocp-Apim-Subscription-Key", speechKey)
                    .addHeader("Content-Type", "application/ssml+xml")
                    .addHeader("X-Microsoft-OutputFormat", outputFormat)
                    .addHeader("User-Agent", userAgent)
                    .build();

            // Closing the Response also closes its body, so one try-with-resources is enough.
            try (Response response = HTTP_CLIENT.newCall(request).execute()) {
                ResponseBody responseBody = response.body();

                if (!response.isSuccessful()) {
                    String errorDetails = responseBody != null ? responseBody.string() : "无详细信息";

                    System.err.println("请求失败: " + response.code() + " " + response.message());
                    System.err.println("错误详情: " + errorDetails);
                    return false;
                }

                if (responseBody == null) {
                    System.err.println("响应体为空");
                    return false;
                }

                // Stream the audio to disk in fixed-size chunks.
                try (InputStream inputStream = responseBody.byteStream();
                     OutputStream outputStream = new FileOutputStream(outputFilePath)) {

                    byte[] buffer = new byte[4096];
                    int bytesRead;
                    while ((bytesRead = inputStream.read(buffer)) != -1) {
                        outputStream.write(buffer, 0, bytesRead);
                    }
                } catch (IOException e) {
                    // Pass the exception itself so the stack trace reaches the log.
                    logger.error("【TTS文本转音频错误】保存文件失败: {}", e.getMessage(), e);
                    return false;
                }

                // Reported only after both streams were closed without error.
                File outputFile = new File(outputFilePath);
                System.out.println("语音文件已成功保存至: " + outputFile.getAbsolutePath());
                return true;
            }
        } catch (Exception e) {
            // Boundary catch: callers of this method rely on a boolean result, not on exceptions.
            logger.error("【TTS文本转音频错误】请求失败: {}", e.getMessage(), e);
            return false;
        }
    }

    /**
     * Escapes the five XML special characters so a value can be placed safely in SSML
     * element text or inside a quoted attribute.
     *
     * @param raw value to escape; must not be {@code null}
     * @return the escaped value
     */
    private static String escapeXml(String raw) {
        StringBuilder escaped = new StringBuilder(raw.length() + 16);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '\'':
                    escaped.append("&apos;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }
}