Streaming Synthesis

Describes how to submit text to the VoiceGen server for streaming synthesis.

The following example shows how to synthesize streaming audio from text using VoiceGen’s StreamingSynthesize request. The audio can be played back as it is being streamed as well as being saved to a file or buffer.

Synthesizing streaming audio and writing to a file

We support streaming several headered file formats, including WAV, MP3, FLAC, etc., as well as streaming raw audio samples. For more details, please see the protocol buffer specification here.
The examples below show how to submit a chunk of text and receive streaming audio which is written to a file. We will query the server for available models and use the first model for synthesis.

Python
Go

import grpc
import cobaltspeech.voicegen.v1.voicegen_pb2_grpc as stub
import cobaltspeech.voicegen.v1.voicegen_pb2 as voicegen

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceGenServiceStub(channel)

# Get server version.
versionResp = client.Version(voicegen.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicegen.ListModelsRequest())

# A model may be a single-speaker model or a multi-speaker model.
# The speakers available for a model will be printed in the model
# attributes below.
print("Models:")
for model in modelResp.models:
    print(model)

# Going with the first model in this example. Also using the first
# speaker available in the model (in case of single-speaker models,
# it is the *only* speaker). Exit with a readable message instead of
# an IndexError if the server has nothing to offer.
if not modelResp.models:
    raise SystemExit("no models available on the server")
model = modelResp.models[0]

if not model.attributes.speakers:
    raise SystemExit("the first model on the server lists no speakers")
spk = model.attributes.speakers[0]

# Set the synthesis config.
#
# - We could set speaker_id to None to let the server use the default
#   speaker configured on the server side.
#
# - We are specifying the output audio format to be WAV with 16 bit signed
#   samples, at the model's native sampling rate.
cfg = voicegen.SynthesisConfig(
    model_id=model.id,
    speaker_id=spk.id,
    audio_format=voicegen.AudioFormat(
        codec=voicegen.AUDIO_CODEC_WAV,
        sample_rate=model.attributes.native_audio_format.sample_rate,
        encoding=voicegen.AUDIO_ENCODING_SIGNED,
        bit_depth=16,
        channels=1,
        byte_order=voicegen.BYTE_ORDER_LITTLE_ENDIAN,
    ),
)

# Specifying text to synthesize, which could be a single line or multiple paragraphs.
# VoiceGen breaks up the text based on its sentence segmentation algorithm as well as
# any line breaks specified in the input text. We intentionally put line breaks here
# to make it look a bit nicer in the code, which are replaced with spaces.
text = voicegen.SynthesisText(text='''
The world's first 3D printed rocket launched successfully on Wednesday, marking
a step forward for the California company behind the innovative spacecraft,
though it failed to reach orbit.

The successful launch came on the third attempt. It had originally been
scheduled to launch on March 8 but was postponed at the last minute because of
propellant temperature issues. A second attempt on March 11 was scrubbed because of
fuel pressure problems.

Had Terran 1 reached low Earth orbit, it would have been the first privately
funded vehicle using methane fuel to do so on its first try, according to
Relativity.
'''.replace("\n", " "))

# Submitting request to the server and writing streamed audio chunks to file.
print("Synthesizing ...")
request = voicegen.StreamingSynthesizeRequest(config=cfg, text=text)
with open("output.wav", "wb") as f:
    for resp in client.StreamingSynthesize(request):
        f.write(resp.audio.data)

# Release the connection now that all audio has been received.
channel.close()

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"strings"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicegenpb "github.com/cobaltspeech/go-genproto/cobaltspeech/voicegen/v1"
)

// main runs the file-writing synthesis example and exits with status 1 if any
// step fails.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run connects to the VoiceGen server, picks the first model and speaker it
// reports, synthesizes a fixed piece of text and writes the streamed audio to
// output.wav.
//
// Failures are returned instead of calling os.Exit directly so that the
// deferred cleanup (closing the connection and the output file) always runs.
func run() (err error) {
	const (
		serverAddress = "localhost:2727"
		outputPath    = "output.wav"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicegenpb.NewVoiceGenServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicegenpb.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicegenpb.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	// A model may be a single-speaker model or a multi-speaker model.
	// The speakers available for a model will be printed in the model
	// attributes below.
	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Going with the first model in this example. Also using the first
	// speaker available in the model (in case of single-speaker models,
	// it is the *only* speaker). Report a readable error instead of
	// panicking if the server has nothing to offer.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}
	model := modelResp.Models[0]

	speakers := model.GetAttributes().GetSpeakers()
	if len(speakers) == 0 {
		return errors.New("the first model on the server lists no speakers")
	}
	spk := speakers[0]

	// Set the synthesis config.
	//
	//   - We could leave SpeakerId empty to let the server use the default
	//     speaker configured on the server side.
	//
	//   - We are specifying the output audio format to be WAV with 16 bit signed
	//     samples, at the model's native sampling rate.
	cfg := &voicegenpb.SynthesisConfig{
		ModelId:   model.Id,
		SpeakerId: spk.Id,
		AudioFormat: &voicegenpb.AudioFormat{
			Codec:      voicegenpb.AudioCodec_AUDIO_CODEC_WAV,
			SampleRate: model.GetAttributes().GetNativeAudioFormat().GetSampleRate(),
			Encoding:   voicegenpb.AudioEncoding_AUDIO_ENCODING_SIGNED,
			BitDepth:   16,
			Channels:   1,
			ByteOrder:  voicegenpb.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
		},
	}

	// Specifying text to synthesize, which could be a single line or multiple paragraphs.
	// VoiceGen breaks up the text based on its sentence segmentation algorithm as well as
	// any line breaks specified in the input text. We intentionally put line breaks here
	// to make it look a bit nicer in the code, which are replaced with spaces.
	text := &voicegenpb.SynthesisText{Text: strings.ReplaceAll(`
The world's first 3D printed rocket launched successfully on Wednesday, marking
a step forward for the California company behind the innovative spacecraft,
though it failed to reach orbit.

The successful launch came on the third attempt. It had originally been
scheduled to launch on March 8 but was postponed at the last minute because of
propellant temperature issues. A second attempt on March 11 was scrubbed because of
fuel pressure problems.

Had Terran 1 reached low Earth orbit, it would have been the first privately
funded vehicle using methane fuel to do so on its first try, according to
Relativity.
`, "\n", " ")}

	// Submitting request to the server. The stream shares ctx so that it is
	// cancelled together with everything else when run returns.
	fmt.Println("Synthesizing ...")
	stream, err := client.StreamingSynthesize(ctx, &voicegenpb.StreamingSynthesizeRequest{Config: cfg, Text: text})
	if err != nil {
		return fmt.Errorf("failed to start synthesis stream: %w", err)
	}

	// Opening output audio file.
	outF, err := os.Create(outputPath)
	if err != nil {
		return fmt.Errorf("failed to open output audio file: %w", err)
	}

	// A failed Close can mean the audio was not fully written, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if cerr := outF.Close(); cerr != nil && err == nil {
			err = fmt.Errorf("failed to close output audio file: %w", cerr)
		}
	}()

	// Receiving audio and writing to file.
	for {
		resp, err := stream.Recv()
		if errors.Is(err, io.EOF) {
			// The server has sent all of the audio.
			return nil
		}

		if err != nil {
			return fmt.Errorf("error encountered while synthesizing: %w", err)
		}

		audio := resp.GetAudio()
		if audio == nil {
			return errors.New("error encountered while synthesizing: server returned nil audio")
		}

		if _, err := outF.Write(audio.Data); err != nil {
			return fmt.Errorf("failed to write to output audio file: %w", err)
		}
	}
}

Synthesizing streaming audio with live playback

The synthesized audio stream can be played back live instead of saving it to a file by writing the data to an appropriate interface that can do the playback; typically this requires interaction with system libraries. Another option is to pipe the audio out to an external command line tool like sox.
The examples below use the latter approach by using the play command provided with sox to play the synthesized audio stream live.

Python
Go

import subprocess
import grpc
import cobaltspeech.voicegen.v1.voicegen_pb2_grpc as stub
import cobaltspeech.voicegen.v1.voicegen_pb2 as voicegen

serverAddress = "localhost:2727"

# Using a channel without TLS enabled.
channel = grpc.insecure_channel(serverAddress)
client = stub.VoiceGenServiceStub(channel)

# Get server version.
versionResp = client.Version(voicegen.VersionRequest())
print(versionResp)

# Get list of models on the server.
modelResp = client.ListModels(voicegen.ListModelsRequest())

# A model may be a single-speaker model or a multi-speaker model.
# The speakers available for a model will be printed in the model
# attributes below.
print("Models:")
for model in modelResp.models:
    print(model)

# Going with the first model in this example. Also using the first
# speaker available in the model (in case of single-speaker models,
# it is the *only* speaker). Exit with a readable message instead of
# an IndexError if the server has nothing to offer.
if not modelResp.models:
    raise SystemExit("no models available on the server")
model = modelResp.models[0]

if not model.attributes.speakers:
    raise SystemExit("the first model on the server lists no speakers")
spk = model.attributes.speakers[0]

# Set the synthesis config.
#
# - We could set speaker_id to None to let the server use the default
#   speaker configured on the server side.
#
# - We are specifying the output audio format to be WAV with 16 bit signed
#   samples, at the model's native sampling rate.
cfg = voicegen.SynthesisConfig(
    model_id=model.id,
    speaker_id=spk.id,
    audio_format=voicegen.AudioFormat(
        codec=voicegen.AUDIO_CODEC_WAV,
        sample_rate=model.attributes.native_audio_format.sample_rate,
        encoding=voicegen.AUDIO_ENCODING_SIGNED,
        bit_depth=16,
        channels=1,
        byte_order=voicegen.BYTE_ORDER_LITTLE_ENDIAN,
    ),
)

# Specifying text to synthesize, which could be a single line or multiple paragraphs.
# VoiceGen breaks up the text based on its sentence segmentation algorithm as well as
# any line breaks specified in the input text. We intentionally put line breaks here
# to make it look a bit nicer in the code, which are replaced with spaces.
text = voicegen.SynthesisText(text='''
The world's first 3D printed rocket launched successfully on Wednesday, marking
a step forward for the California company behind the innovative spacecraft,
though it failed to reach orbit.

The successful launch came on the third attempt. It had originally been
scheduled to launch on March 8 but was postponed at the last minute because of
propellant temperature issues. A second attempt on March 11 was scrubbed because of
fuel pressure problems.

Had Terran 1 reached low Earth orbit, it would have been the first privately
funded vehicle using methane fuel to do so on its first try, according to
Relativity.
'''.replace("\n", " "))

# Open playback stream using sox's play command as subprocess. Using Popen as
# a context manager closes the player's stdin and waits for it to exit when
# the block ends, even if streaming fails part-way through.
with subprocess.Popen(["play", "-t", "wav", "-"], stdin=subprocess.PIPE) as play:
    # Submitting request to the server and writing streamed audio chunks to playback stream.
    print("Synthesizing ...")
    request = voicegen.StreamingSynthesizeRequest(config=cfg, text=text)
    for resp in client.StreamingSynthesize(request):
        play.stdin.write(resp.audio.data)

# Release the connection now that all audio has been received and played.
channel.close()

package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"strings"

	"golang.org/x/sync/errgroup"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	voicegenpb "github.com/cobaltspeech/go-genproto/cobaltspeech/voicegen/v1"
)

// main runs the live-playback synthesis example and exits with status 1 if
// any step fails.
func main() {
	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run connects to the VoiceGen server, picks the first model and speaker it
// reports, synthesizes a fixed piece of text and pipes the streamed audio into
// sox's play command for live playback.
//
// Failures are returned instead of calling os.Exit directly so that the
// deferred cleanup (cancelling the context and closing the connection) always
// runs.
func run() error {
	const (
		serverAddress = "localhost:2727"
	)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	opts := []grpc.DialOption{
		grpc.WithTransportCredentials(insecure.NewCredentials()), // Using a channel without TLS enabled.
		grpc.WithBlock(),
		grpc.WithReturnConnectionError(),
		grpc.FailOnNonTempDialError(true),
	}

	conn, err := grpc.DialContext(ctx, serverAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to dial gRPC connection: %w", err)
	}
	defer conn.Close()

	client := voicegenpb.NewVoiceGenServiceClient(conn)

	// Get server version.
	versionResp, err := client.Version(ctx, &voicegenpb.VersionRequest{})
	if err != nil {
		return fmt.Errorf("failed to get server version: %w", err)
	}

	fmt.Printf("%v\n", versionResp)

	// Get list of models on the server.
	modelResp, err := client.ListModels(ctx, &voicegenpb.ListModelsRequest{})
	if err != nil {
		return fmt.Errorf("failed to get model list: %w", err)
	}

	// A model may be a single-speaker model or a multi-speaker model.
	// The speakers available for a model will be printed in the model
	// attributes below.
	fmt.Println("Models:")
	for _, m := range modelResp.Models {
		fmt.Println(m)
	}
	fmt.Println()

	// Going with the first model in this example. Also using the first
	// speaker available in the model (in case of single-speaker models,
	// it is the *only* speaker). Report a readable error instead of
	// panicking if the server has nothing to offer.
	if len(modelResp.Models) == 0 {
		return errors.New("no models available on the server")
	}
	model := modelResp.Models[0]

	speakers := model.GetAttributes().GetSpeakers()
	if len(speakers) == 0 {
		return errors.New("the first model on the server lists no speakers")
	}
	spk := speakers[0]

	// Set the synthesis config.
	//
	//   - We could leave SpeakerId empty to let the server use the default
	//     speaker configured on the server side.
	//
	//   - We are specifying the output audio format to be WAV with 16 bit signed
	//     samples, at the model's native sampling rate.
	cfg := &voicegenpb.SynthesisConfig{
		ModelId:   model.Id,
		SpeakerId: spk.Id,
		AudioFormat: &voicegenpb.AudioFormat{
			Codec:      voicegenpb.AudioCodec_AUDIO_CODEC_WAV,
			SampleRate: model.GetAttributes().GetNativeAudioFormat().GetSampleRate(),
			Encoding:   voicegenpb.AudioEncoding_AUDIO_ENCODING_SIGNED,
			BitDepth:   16,
			Channels:   1,
			ByteOrder:  voicegenpb.ByteOrder_BYTE_ORDER_LITTLE_ENDIAN,
		},
	}

	// Specifying text to synthesize, which could be a single line or multiple paragraphs.
	// VoiceGen breaks up the text based on its sentence segmentation algorithm as well as
	// any line breaks specified in the input text. We intentionally put line breaks here
	// to make it look a bit nicer in the code, which are replaced with spaces.
	text := &voicegenpb.SynthesisText{Text: strings.ReplaceAll(`
The world's first 3D printed rocket launched successfully on Wednesday, marking
a step forward for the California company behind the innovative spacecraft,
though it failed to reach orbit.

The successful launch came on the third attempt. It had originally been
scheduled to launch on March 8 but was postponed at the last minute because of
propellant temperature issues. A second attempt on March 11 was scrubbed because of
fuel pressure problems.

Had Terran 1 reached low Earth orbit, it would have been the first privately
funded vehicle using methane fuel to do so on its first try, according to
Relativity.
`, "\n", " ")}

	// Starting routines to receive audio from server and write to playback stream;
	// using an errgroup.Group that returns if either one encounters an error.
	// The group context is derived from ctx, so it is cancelled either when one
	// of the routines fails or when run returns.
	eg, egCtx := errgroup.WithContext(ctx)

	// Submitting request to the server; the streamed audio chunks are written
	// to the playback stream below.
	fmt.Println("Synthesizing ...")
	stream, err := client.StreamingSynthesize(egCtx, &voicegenpb.StreamingSynthesizeRequest{Config: cfg, Text: text})
	if err != nil {
		return fmt.Errorf("failed to start synthesis stream: %w", err)
	}

	// Open playback stream using sox's play command as a subprocess.
	cmd := exec.CommandContext(egCtx, "play", "-t", "wav", "-")
	cmd.Stderr = os.Stderr

	outW, err := cmd.StdinPipe()
	if err != nil {
		return fmt.Errorf("failed to open playback stream: %w", err)
	}

	// Run the player until it has consumed all of its input or is killed via
	// egCtx.
	eg.Go(func() error {
		if err := cmd.Run(); err != nil {
			return fmt.Errorf("error encountered in audio playback: %w", err)
		}

		return nil
	})

	eg.Go(func() error {
		// Closing the pipe signals end of input to the player so that it can
		// finish and exit.
		defer outW.Close()

		// Receiving audio and writing to playback stream.
		for {
			resp, err := stream.Recv()
			if errors.Is(err, io.EOF) {
				// The server has sent all of the audio.
				return nil
			}

			if err != nil {
				return fmt.Errorf("error encountered while synthesizing: %w", err)
			}

			audio := resp.GetAudio()
			if audio == nil {
				return errors.New("error encountered while synthesizing: server returned nil audio")
			}

			if _, err := outW.Write(audio.Data); err != nil {
				return fmt.Errorf("failed to write to playback stream: %w", err)
			}
		}
	})

	return eg.Wait()
}