Streaming Identification

Describes how to stream audio to the VoiceBio server and identify the speaker against a given set of voiceprints.

The following example shows how to stream audio using VoiceBio’s StreamingIdentify request and identify the speaker in the audio using provided voiceprints. The stream can come from a file on disk or be directly from a microphone in real time.

Info

If you want to compare against a large number of voiceprints in multiple batches, it will be more efficient to extract the voiceprint from the audio once using the StreamingEnroll request, and then compare voiceprints directly without audio via the CompareVoiceprints request.

Streaming from an audio file

We support several headered file formats, including WAV, MP3 and FLAC. For more details, please see the protocol buffer specification here. For best accuracy, we recommend using an uncompressed or losslessly compressed audio format such as WAV or FLAC.
The examples below use a WAV file as input. We will query the server for available models and use the first model to score and identify the given audio against a given set of voiceprints.

Info

Voiceprints provided in StreamingIdentify requests must be generated using the same or compatible model via StreamingEnroll.

Python
Go

import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example. Stop with a clear message when the server reports no
# models instead of failing with an IndexError on the lookup below.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

modelID = modelResp.models[0].id

# Loading reference voiceprints. Each file is read as text and any
# surrounding whitespace (such as a trailing newline) is removed.
voiceprints = []
for p in ["user1.bin", "user2.bin", "user3.bin"]:
    with open(p, 'r') as f:
        voiceprints.append(voicebio.Voiceprint(data=f.read().strip()))

# Set the identification config. We don't set the audio format and let the
# server auto-detect the format from the file header.
cfg = voicebio.IdentificationConfig(
    model_id=modelID,
    voiceprints=voiceprints,
)

# The server expects the configuration as the very first message of the
# stream; every message after that carries a chunk of audio bytes. This
# generator produces the requests in that order, reading at most
# bufferSize bytes from audio per request until nothing is left.
def stream(cfg, audio, bufferSize=1024):
    yield voicebio.StreamingIdentifyRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            # Nothing left to read: end the request stream.
            return
        yield voicebio.StreamingIdentifyRequest(audio=voicebio.Audio(data=chunk))

# Send the config followed by the contents of the audio file; the call
# returns once the server has produced its identification result.
with open("test.wav", "rb") as audio:
    result = client.StreamingIdentify(stream(cfg, audio))

# The result carries the index of the best matching voiceprint, plus a
# similarity score for every voiceprint and a flag telling whether that
# score exceeded the server-configured threshold for being a match.
#
# A negative best match index means none of the voiceprints were a good match.
comparisons = result.voiceprint_comparison_results
best_index = result.best_match_index

if best_index < 0:
    matched = "❌ No Match found"
else:
    best_score = comparisons[best_index].similarity_score
    matched = f"✅ Match found: Index: {best_index}, Score: {best_score:1.3f}"

print("\nIdentification Result:\n")

print("Scores:")
for index, comparison in enumerate(comparisons):
    print(f"Index: {index}, Score: {comparison.similarity_score:1.3f}, IsMatch: {comparison.is_match}")

print(f"\n{matched}")

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main connects to a VoiceBio server, streams the contents of test.wav to it
// and prints how well the audio matches each of the reference voiceprints.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}

	// Release the connection when main returns. The os.Exit calls on the
	// error paths below skip deferred calls, but the process is terminating
	// at that point anyway.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// The example uses the first model, so stop with a clear message rather
	// than panicking on the index below when the server has none.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Reading voiceprint data.
	voiceprintPaths := []string{"user1.bin", "user2.bin", "user3.bin"}
	voiceprints := make([]*voicebio.Voiceprint, 0, len(voiceprintPaths))

	for i, p := range voiceprintPaths {
		data, err := os.ReadFile(p)
		if err != nil {
			fmt.Printf("\nfailed to read voiceprint[%d] data: %v\n", i, err)
			os.Exit(1)
		}

		voiceprints = append(voiceprints, &voicebio.Voiceprint{Data: string(data)})
	}

	// Selecting the first model. The audio format is left unset here.
	cfg := &voicebio.IdentificationConfig{
		ModelId:     modelResp.Models[0].Id,
		Voiceprints: voiceprints,
	}

	// Opening audio file.
	audio, err := os.Open("test.wav")
	if err != nil {
		fmt.Printf("failed to open audio file: %v\n", err)
		os.Exit(1)
	}

	defer audio.Close()

	// Starting identification.
	result, err := StreamingIdentify(ctx, client, cfg, audio)
	if err != nil {
		fmt.Printf("failed to run streaming identification: %v\n", err)
		os.Exit(1)
	}

	// Server returns the index of the voiceprint that matches the best, a similarity
	// score for each voiceprint along with whether the score exceeded the server-configured
	// threshold for being a match.
	//
	// If none of the voiceprints were a good match, the best match index will be negative.
	matched := "❌ No Match found"
	if result.BestMatchIndex >= 0 {
		bestScore := result.VoiceprintComparisonResults[result.BestMatchIndex].SimilarityScore
		matched = fmt.Sprintf("✅ Match found: Index: %d, Score: %1.3f", result.BestMatchIndex, bestScore)
	}

	fmt.Printf("\nIdentification Result:\n")

	fmt.Printf("Scores:\n")
	for i, r := range result.VoiceprintComparisonResults {
		fmt.Printf("Index: %d, Score: %1.3f, IsMatch: %v\n", i, r.SimilarityScore, r.IsMatch)
	}

	fmt.Printf("\n%s\n", matched)
}

// StreamingIdentify wraps the streaming API for performing speaker identification
// using the given cfg.
//
// The config is sent first. Data is then read from the given audio reader into
// a fixed-size buffer (streamingBufSize bytes) and streamed to the VoiceBio
// server. Once all audio has been sent, the stream is closed and the server's
// response is returned.
//
// If any error occurs while reading the audio or sending it to the server, this
// function exits immediately, returning that error. The stream is cancelled on
// every return path so that its resources are released.
func StreamingIdentify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.IdentificationConfig,
	audio io.Reader,
) (*voicebio.StreamingIdentifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Derive a cancellable context for the stream. Cancelling it on return
	// releases the stream even when we bail out before receiving a response;
	// after a successful CloseAndRecv the cancel has nothing left to stop.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingIdentify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed.  The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}

// sendAudio writes the config message to stream, followed by the contents of
// audio in chunks of at most bufSize bytes.
//
// When audio is exhausted (or reading from it fails), the sending side of the
// stream is closed. A nil return means everything up to io.EOF was sent;
// otherwise the first send, close or read error is returned.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingIdentifyClient,
	cfg *voicebio.IdentificationConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The server expects the config as the first message; every message
	// after that carries audio.
	cfgReq := &voicebio.StreamingIdentifyRequest{
		Request: &voicebio.StreamingIdentifyRequest_Config{Config: cfg},
	}

	if sendErr := stream.Send(cfgReq); sendErr != nil {
		// A failed Send means the stream is already broken, so there is
		// no need to CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)

	for {
		n, readErr := audio.Read(chunk)

		// Forward whatever was read, even if the same Read also
		// reported an error.
		if n > 0 {
			audioReq := &voicebio.StreamingIdentifyRequest{
				Request: &voicebio.StreamingIdentifyRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}

			if sendErr := stream.Send(audioReq); sendErr != nil {
				// As above: the stream has failed, no CloseSend needed.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// Reading is over, either at io.EOF or because of a read failure.
		// In both cases close the sending side before reporting back.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr == io.EOF {
			return nil
		}

		return readErr
	}
}

Streaming from microphone

Streaming audio from microphone input requires a reader interface that can provide audio samples recorded from a microphone; typically this requires interaction with system libraries. Another option is to use an external command line tool like sox to record and pipe audio into the client.
The examples below use the latter approach by using the rec command provided with sox to record and stream the audio.

Python
Go

#!/usr/bin/env python3

# This example assumes sox is installed on the system and is available
# in the system's PATH variable. Instead of opening a regular file from
# disk, we open a subprocess that executes sox's rec command to record
# audio from the system's default microphone.

import subprocess
import grpc
import cobaltspeech.voicebio.v1.voicebio_pb2_grpc as stub
import cobaltspeech.voicebio.v1.voicebio_pb2 as voicebio

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceBioServiceStub(channel)

# Get server version.
versionResp = client.Version(voicebio.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicebio.ListModelsRequest())

print("Models:")
for model in modelResp.models:
    print(model)

# Select a model ID from the list above. Going with the first model
# in this example. Stop with a clear message when the server reports no
# models instead of failing with an IndexError on the lookup below.
if len(modelResp.models) == 0:
    raise SystemExit("no models available on the server")

m = modelResp.models[0]
modelID = m.id

# Loading reference voiceprints. Each file is read as text and any
# surrounding whitespace (such as a trailing newline) is removed.
voiceprints = []
for p in ["user1.bin", "user2.bin", "user3.bin"]:
    with open(p, 'r') as f:
        voiceprints.append(voicebio.Voiceprint(data=f.read().strip()))

# Setting audio format to be raw 16-bit signed little endian audio samples
# recorded at the sample rate expected by the model.
cfg = voicebio.IdentificationConfig(
    model_id=modelID,
    voiceprints=voiceprints,
    audio_format=voicebio.AudioFormat(
      audio_format_raw=voicebio.AudioFormatRAW(
        encoding="AUDIO_ENCODING_SIGNED",
        bit_depth=16,
        byte_order="BYTE_ORDER_LITTLE_ENDIAN",
        sample_rate=m.attributes.sample_rate,
        channels=1,
      )
    ),
)

# Open microphone stream using sox's rec command and record
# audio using the config specified above for *10 seconds*.
maxDuration = 10
cmd = f"rec -t raw -r {m.attributes.sample_rate} -e signed -b 16 -L -c 1 - trim 0 {maxDuration}"
mic = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
audio = mic.stdout

# Sanity check: wait for the first bytes from the recorder. peek() is used
# rather than read() so that no recorded audio is consumed before streaming
# starts. An empty result means rec closed its output without producing
# any audio, in which case there is nothing to stream and we stop here.
try:
    if len(audio.peek(1)) == 0:
        raise RuntimeError("rec produced no audio")
except Exception as err:
    print(f"[ERROR] failed to read audio from mic stream: {err}")
    mic.kill()
    mic.wait()
    raise SystemExit(1)

print(f"\n[INFO] recording {maxDuration} seconds of audio microphone ... \n")

# The server expects the identification configuration as the very first
# message of the stream; every message after that carries a chunk of audio
# bytes. This generator produces the requests in that order, reading at
# most bufferSize bytes from audio per request until nothing is left.
def stream(cfg, audio, bufferSize=1024):
    yield voicebio.StreamingIdentifyRequest(config=cfg)

    while True:
        chunk = audio.read(bufferSize)
        if len(chunk) == 0:
            # Nothing left to read: end the request stream.
            return
        yield voicebio.StreamingIdentifyRequest(audio=voicebio.Audio(data=chunk))

# Streaming audio to the server. The recorder is cleaned up in the finally
# clause so that it does not outlive this script when the request fails.
try:
    result = client.StreamingIdentify(stream(cfg, audio))
finally:
    audio.close()
    mic.kill()
    mic.wait()  # Collect the exit status of the recorder process.

# Server returns the index of the voiceprint that matches the best, a similarity
# score for each voiceprint along with whether the score exceeded the server-configured
# threshold for being a match.
#
# If none of the voiceprints were a good match, the best match index will be negative.
matched = "❌ No Match found"
if result.best_match_index >= 0:
    best_score = result.voiceprint_comparison_results[result.best_match_index].similarity_score
    matched = f"✅ Match found: Index: {result.best_match_index}, Score: {best_score:1.3f}"

print(f"\nIdentification Result:\n")

print("Scores:")
for i, r in enumerate(result.voiceprint_comparison_results):
    print(f"Index: {i}, Score: {r.similarity_score:1.3f}, IsMatch: {r.is_match}")

print(f"\n{matched}")

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"

	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicebio "github.com/cobaltspeech/go-genproto/cobaltspeech/voicebio/v1"
)

// main records audio from the microphone using sox's rec command, streams it
// to a VoiceBio server and prints how well the audio matches each of the
// reference voiceprints.
func main() {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		fmt.Printf("failed to dial gRPC connection: %v\n", err)
		os.Exit(1)
	}

	// Release the connection when main returns. The os.Exit calls on the
	// error paths below skip deferred calls, but the process is terminating
	// at that point anyway.
	defer conn.Close()

	client := voicebio.NewVoiceBioServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicebio.VersionRequest{})
	if err != nil {
		fmt.Printf("failed to get server version: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("%v\n", versionResp)

	// Get list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicebio.ListModelsRequest{})
	if err != nil {
		fmt.Printf("failed to get model list: %v\n", err)
		os.Exit(1)
	}

	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// The example uses the first model, so stop with a clear message rather
	// than panicking on the index below when the server has none.
	if len(modelResp.Models) == 0 {
		fmt.Println("no models available on the server")
		os.Exit(1)
	}

	// Selecting first model.
	m := modelResp.Models[0]

	// Reading voiceprint data.
	voiceprintPaths := []string{"user1.bin", "user2.bin", "user3.bin"}
	voiceprints := make([]*voicebio.Voiceprint, 0, len(voiceprintPaths))

	for i, p := range voiceprintPaths {
		data, err := os.ReadFile(p)
		if err != nil {
			fmt.Printf("\nfailed to read voiceprint[%d] data: %v\n", i, err)
			os.Exit(1)
		}

		voiceprints = append(voiceprints, &voicebio.Voiceprint{Data: string(data)})
	}

	// Setting audio format to be raw 16-bit signed little endian audio samples
	// recorded at the sample rate expected by the model.
	cfg := &voicebio.IdentificationConfig{
		ModelId:     m.Id,
		Voiceprints: voiceprints,
		AudioFormat: &voicebio.AudioFormat{
			AudioFormat: &voicebio.AudioFormat_AudioFormatRaw{
				AudioFormatRaw: &voicebio.AudioFormatRAW{
					Encoding:   voicebio.AudioEncoding_AUDIO_ENCODING_SIGNED,
					SampleRate: m.Attributes.SampleRate,
					BitDepth:   16,
					ByteOrder:  voicebio.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
					Channels:   1,
				},
			},
		},
	}

	// The recorder and the streaming call share an errgroup context, so a
	// failure in the streaming call also stops the recorder.
	eg, egCtx := errgroup.WithContext(ctx)

	// Open microphone stream using sox's rec command and record
	// audio using the config specified above for *10 seconds*.
	maxDuration := 10
	args := fmt.Sprintf("-t raw -r %d -e signed -b 16 -L -c 1 - trim 0 %d", m.Attributes.SampleRate, maxDuration)
	cmd := exec.CommandContext(egCtx, "rec", strings.Fields(args)...)
	cmd.Stderr = os.Stderr

	audio, err := cmd.StdoutPipe()
	if err != nil {
		fmt.Printf("failed to open microphone stream: %v\n", err)
		os.Exit(1)
	}

	// Start (rather than Run) the recorder: Wait closes the stdout pipe, so
	// it must not be called until all reads from the pipe have completed.
	fmt.Printf("\n[INFO] recording %d seconds from microphone \n", maxDuration)

	if err := cmd.Start(); err != nil {
		fmt.Printf("failed to start microphone recording: %v\n", err)
		os.Exit(1)
	}

	// streamed is closed once the streaming call no longer reads from the pipe.
	streamed := make(chan struct{})

	// result is written by the streaming goroutine and only read after
	// eg.Wait has returned.
	var result *voicebio.StreamingIdentifyResponse

	// Starting identification.
	eg.Go(func() error {
		defer close(streamed)

		resp, err := StreamingIdentify(egCtx, client, cfg, audio)
		if err != nil {
			return fmt.Errorf("run streaming identification: %w", err)
		}

		result = resp

		return nil
	})

	// Collecting the recorder's exit status once streaming is done.
	eg.Go(func() error {
		<-streamed

		// Drain anything the streaming call left unread so the recorder
		// cannot block on a full pipe. A read failure is ignored here
		// because the recorder's own status is reported by Wait below.
		_, _ = io.Copy(io.Discard, audio)

		if err := cmd.Wait(); err != nil {
			return fmt.Errorf("record from microphone: %w", err)
		}

		return nil
	})

	if err := eg.Wait(); err != nil {
		fmt.Printf("failed to %v\n", err)
		os.Exit(1)
	}

	// Server returns the index of the voiceprint that matches the best, a similarity
	// score for each voiceprint along with whether the score exceeded the server-configured
	// threshold for being a match.
	//
	// If none of the voiceprints were a good match, the best match index will be negative.
	matched := "❌ No Match found"
	if result.BestMatchIndex >= 0 {
		bestScore := result.VoiceprintComparisonResults[result.BestMatchIndex].SimilarityScore
		matched = fmt.Sprintf("✅ Match found: Index: %d, Score: %1.3f", result.BestMatchIndex, bestScore)
	}

	fmt.Printf("\nIdentification Result:\n")

	fmt.Printf("Scores:\n")
	for i, r := range result.VoiceprintComparisonResults {
		fmt.Printf("Index: %d, Score: %1.3f, IsMatch: %v\n", i, r.SimilarityScore, r.IsMatch)
	}

	fmt.Printf("\n%s\n", matched)
}

// StreamingIdentify wraps the streaming API for performing speaker identification
// using the given cfg.
//
// The config is sent first. Data is then read from the given audio reader into
// a fixed-size buffer (streamingBufSize bytes) and streamed to the VoiceBio
// server. Once all audio has been sent, the stream is closed and the server's
// response is returned.
//
// If any error occurs while reading the audio or sending it to the server, this
// function exits immediately, returning that error. The stream is cancelled on
// every return path so that its resources are released.
func StreamingIdentify(
	ctx context.Context,
	client voicebio.VoiceBioServiceClient,
	cfg *voicebio.IdentificationConfig,
	audio io.Reader,
) (*voicebio.StreamingIdentifyResponse, error) {
	const (
		streamingBufSize = 1024
	)

	// Derive a cancellable context for the stream. Cancelling it on return
	// releases the stream even when we bail out before receiving a response;
	// after a successful CloseAndRecv the cancel has nothing left to stop.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Creating stream.
	stream, err := client.StreamingIdentify(ctx)
	if err != nil {
		return nil, err
	}

	// Sending audio.
	if err := sendAudio(stream, cfg, audio, streamingBufSize); err != nil && !errors.Is(err, io.EOF) {
		// if sendAudio encountered io.EOF, it's only a
		// notification that the stream has closed.  The actual
		// status will be obtained in the CloseAndRecv call. We
		// therefore return on non-EOF errors here.
		return nil, err
	}

	// Returning result.
	return stream.CloseAndRecv()
}

// sendAudio writes the config message to stream, followed by the contents of
// audio in chunks of at most bufSize bytes.
//
// When audio is exhausted (or reading from it fails), the sending side of the
// stream is closed. A nil return means everything up to io.EOF was sent;
// otherwise the first send, close or read error is returned.
func sendAudio(
	stream voicebio.VoiceBioService_StreamingIdentifyClient,
	cfg *voicebio.IdentificationConfig,
	audio io.Reader,
	bufSize uint32,
) error {
	// The server expects the config as the first message; every message
	// after that carries audio.
	cfgReq := &voicebio.StreamingIdentifyRequest{
		Request: &voicebio.StreamingIdentifyRequest_Config{Config: cfg},
	}

	if sendErr := stream.Send(cfgReq); sendErr != nil {
		// A failed Send means the stream is already broken, so there is
		// no need to CloseSend.
		return sendErr
	}

	chunk := make([]byte, bufSize)

	for {
		n, readErr := audio.Read(chunk)

		// Forward whatever was read, even if the same Read also
		// reported an error.
		if n > 0 {
			audioReq := &voicebio.StreamingIdentifyRequest{
				Request: &voicebio.StreamingIdentifyRequest_Audio{
					Audio: &voicebio.Audio{Data: chunk[:n]},
				},
			}

			if sendErr := stream.Send(audioReq); sendErr != nil {
				// As above: the stream has failed, no CloseSend needed.
				return sendErr
			}
		}

		if readErr == nil {
			continue
		}

		// Reading is over, either at io.EOF or because of a read failure.
		// In both cases close the sending side before reporting back.
		if closeErr := stream.CloseSend(); closeErr != nil {
			return closeErr
		}

		if readErr == io.EOF {
			return nil
		}

		return readErr
	}
}