Source code for speechkit._synthesis

import requests

from speechkit.exceptions import RequestError


class SpeechSynthesis:
    """Generates speech from received text."""

    # Longest `text` value accepted by the client-side check before a request is sent.
    _MAX_TEXT_LENGTH = 5000

    def __init__(self, session):
        """
        Initialize :py:class:`speechkit.SpeechSynthesis`

        :param speechkit.Session session: Session instance for auth
        """
        self._headers = session.header
        self._folder_id = session.folder_id

    @classmethod
    def _check_text_length(cls, params):
        """
        Rejects request parameters whose `text` value is too long.

        :type params: dict
        :param params: Keyword arguments passed to a public synthesis method.

        :raises ValueError: If `text` is present and longer than 5000 characters.
        """
        text = params.get('text')
        # A missing or None `text` is left alone (e.g. when only `ssml` is supplied).
        if text is not None and len(text) > cls._MAX_TEXT_LENGTH:
            raise ValueError("Text must be less than 5000 characters")

    def _synthesize_stream(self, **kwargs):
        """
        Sends the synthesis request and returns the complete response body.

        :param kwargs: Request fields sent as form data. A `folderId` given here
            overrides the folder id taken from the session.

        :return: Body of the response.
        :rtype: bytes

        :raises RequestError: If the response status is not a success status.
        """

        url = 'https://tts.api.cloud.yandex.net/speech/v1/tts:synthesize'
        params = {'folderId': self._folder_id} if self._folder_id else {}
        params.update(kwargs)
        # The body is always read in full through `.content`, so the request is not
        # made in streaming mode.
        answer = requests.post(url, headers=self._headers, data=params)

        if not answer.ok:
            try:
                details = answer.json()
            except ValueError:
                # The error body is not JSON (for example an HTML page from a proxy).
                # NOTE(review): assumes RequestError accepts any dict here -- confirm
                # the key names against speechkit.exceptions.RequestError.
                details = {'code': answer.status_code, 'message': answer.text}
            raise RequestError(details)

        return answer.content

    def synthesize(self, file_path, **kwargs):
        """
        Generates speech from received text and saves it to file.

        The file is opened only after the audio data has been received, so a
        rejected or failed request does not create or truncate `file_path`.

        :type file_path: string
        :param file_path: The path to file where store data

        :type text: string
        :param text: UTF-8 encoded text to be converted to speech.
            Use either `text` or `ssml`, not both.
            For homographs, place a `+` before the stressed vowel.
            For example, `contr+ol` or `def+ect`.
            To indicate a pause between words, use `-`.
            Maximum string length: 5000 characters.

        :type ssml: string
        :param ssml: Text in SSML format to be converted into speech.
            Use either `text` or `ssml`, not both.

        :type lang: string
        :param lang: Language.
            Acceptable values:

            * `ru-RU` (default) — Russian.

            * `en-US` — English.

            * `tr-TR` — Turkish.

        :type voice: string
        :param voice: Preferred speech synthesis voice from the list.
            Default value: `oksana`.

        :type speed: string
        :param speed: Rate (speed) of synthesized speech.
            The rate of speech is set as a decimal number in the range from 0.1 to 3.0. Where:

            * `3.0` — Fastest rate.

            * `1.0` (default) — Average human speech rate.

            * `0.1` — Slowest speech rate.

        :type format: string
        :param format: The format of the synthesized audio. Acceptable values:

            * `lpcm` — Audio file is synthesized in LPCM format with no WAV header. Audio properties:

                * Sampling — 8, 16, or 48 kHz, depending on the value of the `sampleRateHertz` parameter.

                * Bit depth — 16-bit.

                * Byte order — Reversed (little-endian).

                * Audio data is stored as signed integers.

            * `oggopus` (default) — Data in the audio file is encoded using the OPUS audio codec and compressed using
                the OGG container format (OggOpus).

        :type sampleRateHertz: string
        :param sampleRateHertz: The sampling frequency of the synthesized audio.
            Used if format is set to lpcm. Acceptable values:

            * `48000` (default): Sampling rate of 48 kHz.

            * `16000`: Sampling rate of 16 kHz.

            * `8000`: Sampling rate of 8 kHz.

        :type folderId: string
        :param folderId: ID of the folder that you have access to.
            Required for authorization with a user account (see the UserAccount resource).
            Don't specify this field if you make a request on behalf of a service account.

        :raises ValueError: If `text` is longer than 5000 characters.
        :raises RequestError: If the response status is not a success status.
        """

        self._check_text_length(kwargs)

        # Fetch the audio before touching the file system: opening the file first
        # would leave an empty (or truncated) file behind when the request fails.
        audio_data = self._synthesize_stream(**kwargs)
        with open(file_path, "wb") as f:
            f.write(audio_data)

    def synthesize_stream(self, **kwargs):
        """
        Generates speech from received text and returns the audio data as bytes.

        :type text: string
        :param text: UTF-8 encoded text to be converted to speech.
            Use either `text` or `ssml`, not both.
            For homographs, place a `+` before the stressed vowel.
            For example, `contr+ol` or `def+ect`.
            To indicate a pause between words, use `-`.
            Maximum string length: 5000 characters.

        :type ssml: string
        :param ssml: Text in SSML format to be converted into speech.
            Use either `text` or `ssml`, not both.

        :type lang: string
        :param lang: Language.
            Acceptable values:

            * `ru-RU` (default) — Russian.

            * `en-US` — English.

            * `tr-TR` — Turkish.

        :type voice: string
        :param voice: Preferred speech synthesis voice from the list.
            Default value: `oksana`.

        :type speed: string
        :param speed: Rate (speed) of synthesized speech.
            The rate of speech is set as a decimal number in the range from 0.1 to 3.0. Where:

            * `3.0` — Fastest rate.

            * `1.0` (default) — Average human speech rate.

            * `0.1` — Slowest speech rate.

        :type format: string
        :param format: The format of the synthesized audio. Acceptable values:

            - `lpcm` — Audio file is synthesized in LPCM format with no WAV header. Audio properties:

                * Sampling — 8, 16, or 48 kHz, depending on the value of the `sampleRateHertz` parameter.

                * Bit depth — 16-bit.

                * Byte order — Reversed (little-endian).

                * Audio data is stored as signed integers.

            - `oggopus` (default) — Data in the audio file is encoded using the OPUS audio codec and compressed using
                the OGG container format (OggOpus).

        :type sampleRateHertz: string
        :param sampleRateHertz: The sampling frequency of the synthesized audio.
            Used if format is set to lpcm. Acceptable values:

            * `48000` (default): Sampling rate of 48 kHz.

            * `16000`: Sampling rate of 16 kHz.

            * `8000`: Sampling rate of 8 kHz.

        :type folderId: string
        :param folderId: ID of the folder that you have access to.
            Required for authorization with a user account (see the UserAccount resource).
            Don't specify this field if you make a request on behalf of a service account.

        :return: The synthesized audio data.
        :rtype: bytes

        :raises ValueError: If `text` is longer than 5000 characters.
        :raises RequestError: If the response status is not a success status.
        """
        self._check_text_length(kwargs)

        return self._synthesize_stream(**kwargs)