Streaming Verification
- The following example shows how to stream audio using VoiceBio’s
StreamingVerify request and verify whether the audio matches the provided voiceprint. The stream can come from a file on disk or directly from a microphone in real time.
Streaming from an audio file
-
We support several headered file formats including WAV, MP3, FLAC etc. For more details, please see the protocol buffer specification here. For best accuracy, it is recommended to use an uncompressed or losslessly compressed audio format like WAV or FLAC.
-
The examples below use a WAV file as input. We will query the server for available models and use the first model to score and verify given audio against a given voiceprint.
Info
Voiceprints provided in StreamingVerify requests must be generated using the
same or compatible model via StreamingEnroll.
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio
serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())
print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example; stop with a clear message if the server lists none
# instead of failing with an IndexError.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")
modelID = modelResp.models[0].id

# Loading reference voiceprint. The file is read as text and surrounding
# whitespace (e.g. a trailing newline) is stripped.
with open("voiceprint.bin", 'r') as f:
    voiceprint = voicebio.Voiceprint(data=f.read().strip())

# Set the verification config. We don't set the audio format and let the
# server auto-detect the format from the file header.
cfg = voicebio.VerificationConfig(
    model_id=modelID,
    voiceprint=voiceprint,
)
# The first request to the server should only contain the
# configuration. Subsequent requests should contain audio
# bytes. We can write a simple generator to do this.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests for one StreamingVerify call.

    The first request carries only ``cfg``. Every following request carries
    up to ``bufferSize`` bytes obtained from ``audio.read``; the generator
    stops when ``read`` returns no data.
    """
    yield voicebio.StreamingVerifyRequest(config=cfg)

    data = audio.read(bufferSize)
    while len(data) > 0:
        yield voicebio.StreamingVerifyRequest(audio=voicebio.Audio(data=data))
        data = audio.read(bufferSize)
# Streaming audio to the server. The file stays open for the whole call
# because the request generator reads from it lazily.
with open("test.wav", "rb") as audio:
    resp = client.StreamingVerify(stream(cfg, audio))

# Server returns a similarity score along with whether the score
# exceeded the server-configured threshold for being a match.
print(f"Verification Score: {resp.result.similarity_score:1.3f}, Match: {resp.result.is_match}")
package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)
// main connects to a VoiceBio server without TLS, prints the server version
// and the available models, and then verifies the audio in test.wav against
// the voiceprint stored in voiceprint.bin using the first listed model.
//
// Every failure is reported on stdout and ends the process with status 1.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}
	// Release the connection on normal return. The os.Exit paths below skip
	// deferred calls, but they end the process anyway.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")

	for _, m := range modelResp.Models {
		fmt.Println(m)
	}

	fmt.Println()

	// The example uses the first model, so an empty list must be reported
	// here rather than panicking on the index below.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Reading voiceprint data.
	data, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Selecting the first model. No audio format is set in the config.
	cfg := &voicebio.VerificationConfig{
		ModelId:    modelResp.Models[0].Id,
		Voiceprint: &voicebio.Voiceprint{Data: string(data)},
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		fmt.Printf("failed to open audio file: %v\n", err)
		os.Exit(1)
	}

	defer audio.Close()

	// Starting verification.
	resp, err := StreamingVerify(ctx, client, cfg, audio)
	if err != nil {
		fmt.Printf("failed to run streaming verification: %v\n", err)
		os.Exit(1)
	}

	// Server returns a similarity score along with whether the score
	// exceeded the server-configured threshold for being a match.
	fmt.Printf("Verification Score: %1.3f, Match: %v\n", resp.Result.SimilarityScore, resp.Result.IsMatch)
}
// StreamingVerify wraps the streaming API for performing speaker verification
// using the given cfg.
//
// cfg is sent as the first message on the stream. Data is then read from the
// given audio reader into a buffer of streamingBufSize bytes and streamed to
// the VoiceBio server. Once the audio is exhausted, the server's response is
// returned.
//
// If any error occurs while reading the audio or sending it to the server,
// this function exits immediately, returning that error. The RPC is cancelled
// when the function returns, so no stream is left open on any path.
func StreamingVerify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
) (*voicebio.StreamingVerifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Scope the RPC to this call. Without this, an early return below would
	// leave the stream open until the caller's context is cancelled.
	// Cancelling after CloseAndRecv has returned has no effect on the result.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingVerify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed. The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}
// sendAudio sends the config and audio to a stream.
func sendAudio(
stream voicebio.VoiceBioService_StreamingVerifyClient,
cfg *voicebio.VerificationConfig,
audio io.Reader,
bufSize uint32,
) error {
// The first message needs to be a config message, and all subsequent
// messages must be audio messages.
// Send the config.
if err := stream.Send(&voicebio.StreamingVerifyRequest{
Request: &voicebio.StreamingVerifyRequest_Config{Config: cfg},
}); err != nil {
// if this failed, we don't need to CloseSend
return err
}
// Stream the audio.
buf := make([]byte, bufSize)
for {
n, err := audio.Read(buf)
if n > 0 {
if err2 := stream.Send(&voicebio.StreamingVerifyRequest{
Request: &voicebio.StreamingVerifyRequest_Audio{
Audio: &voicebio.Audio{Data: buf[:n]},
},
}); err2 != nil {
// if we couldn't Send, the stream has
// encountered an error and we don't need to
// CloseSend.
return err2
}
}
if err != nil {
// err could be io.EOF, or some other error reading from
// audio. In any case, we need to CloseSend, send the
// appropriate error to errCh and return from the function
if err2 := stream.CloseSend(); err2 != nil {
return err2
}
if err != io.EOF {
return err
}
return nil
}
}
}Streaming from microphone
-
Streaming audio from microphone input basically requires a reader interface that can provide audio samples recorded from a microphone; typically this requires interaction with system libraries. Another option is to use an external command line tool like
sox to record and pipe audio into the client. -
The examples below use the latter approach by using the
rec command provided with sox to record and stream the audio.
#!/usr/bin/env python3
# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.
import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio
serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())
print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example; stop with a clear message if the server lists none
# instead of failing with an IndexError.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")
m = modelResp.models[0]
modelID = m.id

# Loading reference voiceprint. The file is read as text and surrounding
# whitespace (e.g. a trailing newline) is stripped.
with open("voiceprint.bin", 'r') as f:
    voiceprint = voicebio.Voiceprint(data=f.read().strip())

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model.
cfg = voicebio.VerificationConfig(
    model_id=modelID,
    voiceprint=voiceprint,
    audio_format=voicebio.AudioFormat(
        audio_format_raw=voicebio.AudioFormatRAW(
            encoding="AUDIO_ENCODING_SIGNED",
            bit_depth=16,
            byte_order="BYTE_ORDER_LITTLE_ENDIAN",
            sample_rate=m.attributes.sample_rate,
            channels=1,
        )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check that the recorder produces data. peek() is used instead of
# read() so that no recorded audio is consumed before streaming starts.
# An empty result means rec closed its output without writing anything.
try:
    gotAudio = len(audio.peek(1)) > 0
except Exception as err:
    print(f"[ERROR] failed to read audio from mic stream: {err}")
    mic.kill()
    raise SystemExit(1)

if not gotAudio:
    print("[ERROR] failed to read audio from mic stream: no data received")
    mic.kill()
    raise SystemExit(1)

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")
# The first request to the server should only contain the
# verification configuration. Subsequent requests should contain
# audio bytes. We can write a simple generator to do this.
def stream(cfg, audio, bufferSize=1024):
    """Yield the requests for one StreamingVerify call.

    The first request carries only ``cfg``. Every following request carries
    up to ``bufferSize`` bytes obtained from ``audio.read``; the generator
    stops when ``read`` returns no data.
    """
    yield voicebio.StreamingVerifyRequest(config=cfg)

    data = audio.read(bufferSize)
    while len(data) > 0:
        yield voicebio.StreamingVerifyRequest(audio=voicebio.Audio(data=data))
        data = audio.read(bufferSize)
# Streaming audio to the server. The request generator reads from the
# recorder's stdout until it reports end of data.
resp = client.StreamingVerify(stream(cfg, audio))
# Server returns a similarity score along with whether the score
# exceeded the server-configured threshold for being a match.
print(f"Verification Score: {resp.result.similarity_score:1.3f}, Match: {resp.result.is_match}")
# Close our end of the recorder's stdout pipe.
# NOTE(review): this cleanup is skipped if the call above raises.
audio.close()
mic.kill()
package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"os/exec"
"strings"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)
// main connects to a VoiceBio server without TLS, prints the server version
// and the available models, records audio from the default microphone with
// sox's rec command and verifies it against the voiceprint stored in
// voiceprint.bin using the first listed model.
//
// Every failure is reported on stdout and ends the process with status 1.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}
	// Release the connection on normal return. The os.Exit paths below skip
	// deferred calls, but they end the process anyway.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get the list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")

	for _, m := range modelResp.Models {
		fmt.Println(m)
	}

	fmt.Println()

	// The example uses the first model, so an empty list must be reported
	// here rather than panicking on the index below.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Reading voiceprint data.
	data, err := os.ReadFile("voiceprint.bin")
	if err != nil {
		fmt.Printf("\nfailed to read voiceprint data: %v\n", err)
		os.Exit(1)
	}

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.VerificationConfig{
		ModelId:    m.Id,
		Voiceprint: &voicebio.Voiceprint{Data: string(data)},
		AudioFormat: &voicebio.AudioFormat{AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
			AudioFormatRaw: &voicebio.AudioFormatRAW{
				Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
				SampleRate: m.Attributes.SampleRate,
				BitDepth:   16,
				ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
				Channels:   1,
			},
		},
		},
	}

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)

	cmd := exec.CommandContext(ctx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	// The recorder writes into an in-memory pipe and the verification reads
	// from its other end. cmd.StdoutPipe is deliberately not used: os/exec
	// documents Run as incorrect with StdoutPipe, because Wait closes that
	// pipe as soon as the command exits, possibly before all audio is read.
	audio, audioW := io.Pipe()
	cmd.Stdout = audioW

	// Starting routines to record from microphone and stream to server
	// using an errgroup.Group whose context is cancelled if recording fails.
	eg, ctx := errgroup.WithContext(ctx)

	eg.Go(func() error {
		fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

		err := cmd.Run()
		if err != nil {
			err = fmt.Errorf("record from microphone: %w", err)
		}

		// Signal the end of the audio to the reader: io.EOF when err is
		// nil, the recording error otherwise. CloseWithError always
		// returns nil.
		_ = audioW.CloseWithError(err)

		return err
	})

	// Starting verification.
	resp, err := StreamingVerify(ctx, client, cfg, audio)

	// Nothing reads the audio any more. Closing the read side makes pending
	// and future writes fail, so the recorder cannot block on the pipe and
	// Wait is guaranteed to return. Close on a PipeReader always returns nil.
	_ = audio.Close()
	recErr := eg.Wait()

	if err != nil {
		fmt.Printf("failed to run streaming verification: %v\n", err)

		if recErr != nil {
			fmt.Printf("failed to %v\n", recErr)
		}

		os.Exit(1)
	}

	// recErr is not checked on this path: a recording failure is delivered
	// through the pipe and would have made StreamingVerify fail above. An
	// error seen only here stems from closing the read side early.

	// Server returns a similarity score along with whether the score
	// exceeded the server-configured threshold for being a match.
	fmt.Printf("Verification Score: %1.3f, Match: %v\n", resp.Result.SimilarityScore, resp.Result.IsMatch)
}
// StreamingVerify wraps the streaming API for performing speaker verification
// using the given cfg.
//
// cfg is sent as the first message on the stream. Data is then read from the
// given audio reader into a buffer of streamingBufSize bytes and streamed to
// the VoiceBio server. Once the audio is exhausted, the server's response is
// returned.
//
// If any error occurs while reading the audio or sending it to the server,
// this function exits immediately, returning that error. The RPC is cancelled
// when the function returns, so no stream is left open on any path.
func StreamingVerify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
) (*voicebio.StreamingVerifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Scope the RPC to this call. Without this, an early return below would
	// leave the stream open until the caller's context is cancelled.
	// Cancelling after CloseAndRecv has returned has no effect on the result.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingVerify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed. The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}
// sendAudio sends the config and then the audio to a stream.
//
// The first message carries cfg; every following message carries the bytes
// returned by one Read from audio, using a buffer of bufSize bytes (a default
// size is used when bufSize is 0). When audio is exhausted or fails to read,
// the send side of the stream is closed.
//
// It returns nil once all audio was sent and the send side was closed, the
// error from Send if a message could not be sent, or the read error if audio
// failed with anything other than io.EOF.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingVerifyClient,
	cfg *voicebio.VerificationConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	const defaultBufSize = 1024

	// A reader may return (0, nil) for a zero-length buffer, which would
	// make the loop below spin forever without sending anything.
	if bufSize == 0 {
		bufSize = defaultBufSize
	}

	// The first message needs to be a config message, and all subsequent
	// messages must be audio messages.

	// Send the config.
	if err := stream.Send(&voicebio.StreamingVerifyRequest{
		Request: &voicebio.StreamingVerifyRequest_Config{Config: cfg},
	}); err != nil {
		// if this failed, we don't need to CloseSend
		return err
	}

	// Stream the audio.
	buf := make([]byte, bufSize)

	for {
		// A Read may return data together with an error, so the data is
		// sent before the error is looked at.
		n, readErr := audio.Read(buf)
		if n > 0 {
			if err := stream.Send(&voicebio.StreamingVerifyRequest{
				Request: &voicebio.StreamingVerifyRequest_Audio{
					Audio: &voicebio.Audio{Data: buf[:n]},
				},
			}); err != nil {
				// if we couldn't Send, the stream has
				// encountered an error and we don't need to
				// CloseSend.
				return err
			}
		}

		if readErr == nil {
			continue
		}

		// readErr is io.EOF or a genuine read failure. In both cases no
		// more audio will follow, so close the send side.
		closeErr := stream.CloseSend()

		if !errors.Is(readErr, io.EOF) {
			// The read failure is the root cause; report it even if
			// CloseSend failed as well.
			return readErr
		}

		return closeErr
	}
}