前言:该UE5.1项目实现了离线实时语音转文字并朗读输出结果的功能,能作为一个实现参考。

1.准备工作:

  • 下载我打包好的插件、环境、模型:baiduyun、更多模型下载

  • 安装UE5.1引擎,VS开发环境&编译器:Epic Launcher、Microsoft VS

  • 新建空白C++项目后关闭引擎,并打开项目文件夹:

  • 项目文件夹中放入下载的插件与语言模型:

                1.项目目录/Plugins文件夹中放入下载解压好的VoskPlugin插件

                2.项目目录/Vosk文件夹中放入下载解压好的语言服务器

                3.项目目录/Vosk/install/Models文件夹中放入解压好的大、小中文模型

2.各项设置:

  • 修改配置,项目支持语音: 项目目录/Config文件夹中修改DefaultEngine.ini,末尾添加配置项

  • [Voice]
    bEnabled=true
    [SystemSettings]
    voice.SilenceDetectionThreshold=0.01
  • 打开项目并在菜单栏-工具-新建C++类KTTKComponent(继承自VoskComponent)到Vosk插件中:

  • KTTKComponent.h中实现服务器初始化,各项默认配置,开启/关闭识别函数:

#pragma once

#include "CoreMinimal.h"
#include "Components/ActorComponent.h"

#include "VoskComponent.h"
#include "VoskServerParameters.h"
#include "ProcessHandleWrapper.h"
#include "Engine/World.h"
#include "TimerManager.h"
#include "Kismet/KismetSystemLibrary.h"

#include "KTTKComponent.generated.h"


/**
 * Speech-recognition component built on top of UVoskComponent.
 *
 * In BeginPlay it selects a language model path, builds the server command
 * line, starts the recognition server process, and schedules a delayed call
 * to Initialize(VoskServerIP, VoskServerPort). Start/End forward to the
 * capture functions BeginCapture/FinishCapture.
 */
UCLASS( ClassGroup=(Custom), meta=(BlueprintSpawnableComponent) )
class VOSKPLUGIN_API UKTTKComponent : public UVoskComponent
{
	GENERATED_BODY()

public:	
	// Enables ticking for this component (see the constructor definition).
	UKTTKComponent();

public:
	// Model path chosen in BeginPlay: BigModelPath when UseBigModel is set, otherwise SmallModelPath.
	FString ModelPath;
	// Passed to BuildServerParameters in BeginPlay; presumably receives its success flag — TODO confirm.
	bool BuildVoskSucess = false;
	// Value returned by BuildServerParameters; passed to CreateProcessV as the server arguments.
	TArray<FString> CommandLineArgs;

	// Selects BigModelPath instead of SmallModelPath when true.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="使用AI语言大模型"))
	bool UseBigModel = false;

	// Path of the large Chinese model; defaults to a folder under the project directory.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="AI语言大模型路径"))
	FString BigModelPath = UKismetSystemLibrary::GetProjectDirectory() + "Vosk/install/Models/vosk-model-cn-0.22";

	// Path of the small Chinese model; defaults to a folder under the project directory.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="AI语言小模型路径"))
	FString SmallModelPath = UKismetSystemLibrary::GetProjectDirectory() + "Vosk/install/Models/vosk-model-small-cn-0.22";

	// Path of the recognition server executable started in BeginPlay.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="识别服务器程序"))
	FString VoskServerExe = UKismetSystemLibrary::GetProjectDirectory() + "Vosk/install/asr_server.exe";

	// Address passed to Initialize when connecting to the recognition server.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="识别服务器IP"))
	FString VoskServerIP = "127.0.0.1";

	// Port passed to Initialize; the editor clamps it to the range 1024-65535.
	UPROPERTY(BlueprintReadWrite,EditAnywhere,category = "AI",meta = (displayName="识别服务器端口", ClampMin = "1024", ClampMax = "65535"))
	int32 VoskServerPort = 25565;

	UPROPERTY(BlueprintReadWrite,category = "AI")// Recognition server configuration
	FVoskServerParameters serverconfig;

	UPROPERTY(BlueprintReadOnly,category = "AI")// Handle of the recognition server process
	FProcessHandleWrapper ProcessHandleVosk;

	// Starts recognition: forwards iDeviceNameIn to BeginCapture.
	UFUNCTION(BlueprintCallable,category = "AI",meta = (displayName = "开始识别"))
	void Start(FString iDeviceNameIn);

	// Stops recognition: forwards both output parameters to FinishCapture.
	UFUNCTION(BlueprintCallable,category = "AI",meta = (displayName = "停止识别"))
	void End(TArray<uint8>& CaptureData,int32& SamplesRecorded);

protected:
	// Handle of the one-shot timer that delays the call to InitlaziVosk.
	FTimerHandle DelayTimeHandle;
	// Schedules InitlaziVosk on a one-shot 5-second timer.
	void DelayTimmer();
	// Calls DelayTimmer when IsInitialized() reports false; otherwise does nothing.
	void initlazi();
	// Calls Initialize(VoskServerIP, VoskServerPort) and clears DelayTimeHandle.
	void InitlaziVosk();

	// Selects the model, starts the server process and schedules the delayed initialization.
	virtual void BeginPlay() override;
	// Calls the parent implementation and may kill the server process; see the definition for when.
	virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;

public:	
	// Forwards to the parent implementation only.
	virtual void TickComponent(float DeltaTime, ELevelTick TickType, FActorComponentTickFunction* ThisTickFunction) override;
};
  • KTTKComponent.cpp中具体实现:

#include "KTTKComponent.h"

/** Constructs the component and marks it as able to tick. */
UKTTKComponent::UKTTKComponent()
{
	// Without this flag TickComponent would never be invoked for this component.
	PrimaryComponentTick.bCanEverTick = true;
}

void UKTTKComponent::BeginPlay()
{
	Super::BeginPlay();

	if(UseBigModel==true)//Use Chinese Big Model or Small Model(使用中文语言大模型/小模型)
	{
		ModelPath = BigModelPath;
	}else{ModelPath = SmallModelPath;};
	FString FullPathOfProgramToRun = VoskServerExe;//服务器执行程序路径 

	serverconfig.PathToModel = ModelPath;//设置语言模型路径

	CommandLineArgs = BuildServerParameters(serverconfig,BuildVoskSucess);//创建执行命令

	CreateProcessV(ProcessHandleVosk,FullPathOfProgramToRun,CommandLineArgs,false,true,0);//创建执行识别进程

	initlazi();//进行识别服务器初始化
}

void UKTTKComponent::DelayTimmer()
{
	GetWorld()->GetTimerManager().SetTimer(DelayTimeHandle,this,&UKTTKComponent::InitlaziVosk,5.0f,false);
}

/** Schedules the delayed initialization unless the component is already initialized. */
void UKTTKComponent::initlazi()
{
	// Nothing to do when the recognition service has already been initialized.
	if (IsInitialized())
	{
		return;
	}

	// Defer the connection by five seconds (see DelayTimmer).
	DelayTimmer();
}

void UKTTKComponent::InitlaziVosk()
{
	Initialize(VoskServerIP,VoskServerPort);
	GetWorld()->GetTimerManager().ClearTimer(DelayTimeHandle);//清除定时handle
}

/**
 * Starts recording speech.
 * @param iDeviceNameIn Device name forwarded unchanged to BeginCapture.
 */
void UKTTKComponent::Start(FString iDeviceNameIn)
{
	// Begin recording the conversation.
	BeginCapture(iDeviceNameIn);
}

/**
 * Stops recording speech.
 * @param CaptureData     Output parameter forwarded to FinishCapture.
 * @param SamplesRecorded Output parameter forwarded to FinishCapture.
 */
void UKTTKComponent::End(TArray<uint8>& CaptureData, int32& SamplesRecorded)
{
	// Finish recording the conversation.
	FinishCapture(CaptureData, SamplesRecorded);
}

/**
 * Tears down what BeginPlay set up: cancels the pending delayed
 * initialization and stops the recognition server process.
 *
 * BeginPlay starts a server process every time it runs, so the process is
 * stopped for every end-play reason. Killing it only for Quit,
 * EndPlayInEditor and Destroyed would leave it running after a level
 * transition or after the owner is removed from the world.
 *
 * @param EndPlayReason Why play is ending; forwarded to the parent implementation.
 */
void UKTTKComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
	Super::EndPlay(EndPlayReason);

	// Cancel the one-shot timer so InitlaziVosk cannot fire after the server is gone.
	if (UWorld* World = GetWorld())
	{
		World->GetTimerManager().ClearTimer(DelayTimeHandle);
	}

	// Stop the recognition server process regardless of the end-play reason.
	KillProcess(ProcessHandleVosk);
}

/** Per-frame tick; performs no work of its own and only forwards to the parent class. */
void UKTTKComponent::TickComponent(
	float DeltaTime,
	ELevelTick TickType,
	FActorComponentTickFunction* ThisTickFunction)
{
	Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
}

3.编译并测试:

  • 生成并打开该项目:

  • 蓝图中创建游戏模式GM_STTS,玩家控制器PC_STTS:

        1.该游戏模式中选定该玩家控制器,并将该游戏模式设置到当前地图的世界场景设置;
        2.玩家控制器中添加组件:KTTK,设置默认配置值(是否使用大的语言模型,模型自定义路径,本机或远程语言服务器IP端口),实现测试蓝图逻辑;
  • 电脑插入麦克风,运行关卡,按下测试按键即可测试回调输出打印字符。

  • 输出最终识别结果时朗读结果:蓝图中的TTSSpeech来自另一插件-TTSPluginMeoPlay(请自行寻找该插件)。

4.项目构建流程图:


后言:该项目实现了语音实时转文字,且能通过调用系统音色朗读输出内容。识别效果准确率根据模型精度而定,模型需要加载到内存,所以还需要考虑硬件需求。希望此文章能帮到你!
Logo

有“AI”的1024 = 2048,欢迎大家加入2048 AI社区

更多推荐