modules/text_renderer/sapi.cpp

   1 /*****************************************************************************
   2  * sapi.cpp: Simple text to Speech renderer for Windows, based on SAPI
   3  *****************************************************************************
   4  * Copyright (c) 2015 Moti Zilberman
   5  *
   6  * Authors: Moti Zilberman
   7  *          Jean-Baptiste Kempf
   8  *
   9  * The MIT License (MIT)
  10  *
  11  * Permission is hereby granted, free of charge, to any person obtaining a copy
  12  * of this software and associated documentation files (the "Software"), to deal
  13  * in the Software without restriction, including without limitation the rights
  14  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  15  * copies of the Software, and to permit persons to whom the Software is
  16  * furnished to do so, subject to the following conditions:
  17  *
  18  * The above copyright notice and this permission notice shall be included in all
  19  * copies or substantial portions of the Software.
  20  *
  21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  22  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  24  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  26  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  27  * SOFTWARE.
  28  *****************************************************************************/
  29
  30 #ifdef HAVE_CONFIG_H
  31 # include "config.h"
  32 #endif
  33
  34 /* VLC core API headers */
  35 #include <vlc_common.h>
  36 #include <vlc_plugin.h>
  37 #include <vlc_filter.h>
  38 #include <vlc_charset.h>
  39 #include <vlc_subpicture.h>
  40
  41 #define INITGUID
  42
  43 #include <windows.h>
  44 #include <sapi.h>
  45 #include <sphelper.h>
  46
  47 static int Create (vlc_object_t *);
  48 static void Destroy(vlc_object_t *);
  49 static int RenderText(filter_t *,
  50                       subpicture_region_t *,
  51                       subpicture_region_t *,
  52                       const vlc_fourcc_t *);
  53
  54 vlc_module_begin ()
  55  set_description(N_("Speech synthesis for Windows"))
  56
  57  set_category(CAT_VIDEO)
  58  set_subcategory(SUBCAT_VIDEO_SUBPIC)
  59
  60  set_capability("text renderer", 0)
  61  set_callbacks(Create, Destroy)
  62  add_integer("sapi-voice", -1, "Voice Index", "Voice index", false)
  63 vlc_module_end ()
  64
  65 struct filter_sys_t
  66 {
  67     ISpVoice* cpVoice;
  68     char* lastString;
  69 };
  70
  71 /* MTA functions */
  72 static int TryEnterMTA(vlc_object_t *obj)
  73 {
  74     HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
  75     if (unlikely(FAILED(hr)))
  76     {
  77         msg_Err (obj, "cannot initialize COM (error 0x%lx)", hr);
  78         return -1;
  79     }
  80     return 0;
  81 }
  82 #define TryEnterMTA(o) TryEnterMTA(VLC_OBJECT(o))
  83
  84 static void EnterMTA(void)
  85 {
  86     HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
  87     if (unlikely(FAILED(hr)))
  88         abort();
  89 }
  90
  91 static void LeaveMTA(void)
  92 {
  93     CoUninitialize();
  94 }
  95
  96 static int Create (vlc_object_t *p_this)
  97 {
  98     filter_t *p_filter = (filter_t *)p_this;
  99     filter_sys_t *p_sys;
 100     HRESULT hr;
 101
 102     if (TryEnterMTA(p_this))
 103         return VLC_EGENERIC;
 104
 105     p_filter->p_sys = p_sys = (filter_sys_t*) malloc(sizeof(filter_sys_t));
 106     if (!p_sys)
 107         goto error;
 108
 109     p_sys->cpVoice = NULL;
 110     p_sys->lastString = NULL;
 111
 112     hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_INPROC_SERVER, IID_ISpVoice, (void**) &p_sys->cpVoice);
 113     if (SUCCEEDED(hr)) {
 114         ISpObjectToken*        cpVoiceToken = NULL;
 115         IEnumSpObjectTokens*   cpEnum = NULL;
 116         ULONG ulCount = 0;
 117
 118         hr = SpEnumTokens(SPCAT_VOICES, NULL, NULL, &cpEnum);
 119         if (SUCCEEDED(hr))
 120         {
 121             // Get the number of voices.
 122             hr = cpEnum->GetCount(&ulCount);
 123             if (SUCCEEDED (hr))
 124             {
 125                 int voiceIndex = var_InheritInteger(p_this, "sapi-voice");
 126                 if (voiceIndex > -1)
 127                 {
 128                     if ((unsigned)voiceIndex < ulCount) {
 129                         hr = cpEnum->Item(voiceIndex, &cpVoiceToken);
 130                         if (SUCCEEDED(hr)) {
 131                             hr = p_sys->cpVoice->SetVoice(cpVoiceToken);
 132                             if (SUCCEEDED(hr)) {
 133                                 msg_Dbg(p_this, "Selected voice %d", voiceIndex);
 134                             }
 135                             else {
 136                                 msg_Err(p_this, "Failed to set voice %d", voiceIndex);
 137                             }
 138                             cpVoiceToken->Release();
 139                             cpVoiceToken = NULL;
 140                         }
 141                     }
 142                     else
 143                         msg_Err(p_this, "Voice index exceeds available count");
 144                 }
 145             }
 146             cpEnum->Release();
 147
 148             /* Set Output */
 149             hr = p_sys->cpVoice->SetOutput(NULL, TRUE);
 150         }
 151     }
 152     else
 153     {
 154         msg_Err(p_filter, "Could not create SpVoice");
 155         goto error;
 156     }
 157
 158     LeaveMTA();
 159
 160     p_filter->pf_render = RenderText;
 161
 162     return VLC_SUCCESS;
 163
 164 error:
 165     LeaveMTA();
 166     free(p_sys);
 167     return VLC_EGENERIC;
 168 }
 169
 170 static void Destroy(vlc_object_t *p_this)
 171 {
 172     filter_t *p_filter = (filter_t *)p_this;
 173     filter_sys_t *p_sys = reinterpret_cast<filter_sys_t *>( p_filter->p_sys );
 174
 175     if (p_sys->cpVoice)
 176         p_sys->cpVoice->Release();
 177
 178     free(p_sys->lastString);
 179     free(p_sys);
 180 }
 181
 182 static int RenderText(filter_t *p_filter,
 183         subpicture_region_t *,
 184         subpicture_region_t *p_region_in,
 185         const vlc_fourcc_t *)
 186 {
 187     filter_sys_t *p_sys = reinterpret_cast<filter_sys_t *>( p_filter->p_sys );
 188     text_segment_t *p_segment = p_region_in->p_text;
 189
 190     if (!p_segment)
 191         return VLC_EGENERIC;
 192
 193     for (const text_segment_t *s = p_segment; s != NULL; s = s->p_next ) {
 194         if (!s->psz_text)
 195             continue;
 196
 197         if (strlen(s->psz_text) == 0)
 198             continue;
 199
 200         if (p_sys->lastString && !strcmp(p_sys->lastString, s->psz_text))
 201             continue;
 202
 203         if (!strcmp(s->psz_text, "\n"))
 204             continue;
 205
 206         /* */
 207         free(p_sys->lastString);
 208         p_sys->lastString = strdup(s->psz_text);
 209
 210         /* */
 211         if (p_sys->lastString) {
 212             msg_Dbg(p_filter, "Speaking '%s'", s->psz_text);
 213
 214             EnterMTA();
 215             wchar_t* wideText = ToWide(s->psz_text);
 216             HRESULT hr = p_sys->cpVoice->Speak(wideText, SPF_ASYNC, NULL);
 217             free(wideText);
 218             if (!SUCCEEDED(hr)) {
 219                 msg_Err(p_filter, "Speak() error");
 220             }
 221             LeaveMTA();
 222         }
 223     }
 224
 225     return VLC_SUCCESS;
 226 }