1 /*****************************************************************************
2 * sapi.cpp: Simple text to Speech renderer for Windows, based on SAPI
3 *****************************************************************************
4 * Copyright (c) 2015 Moti Zilberman
6 * Authors: Moti Zilberman
9 * The MIT License (MIT)
11 * Permission is hereby granted, free of charge, to any person obtaining a copy
12 * of this software and associated documentation files (the "Software"), to deal
13 * in the Software without restriction, including without limitation the rights
14 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 * copies of the Software, and to permit persons to whom the Software is
16 * furnished to do so, subject to the following conditions:
18 * The above copyright notice and this permission notice shall be included in all
19 * copies or substantial portions of the Software.
21 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 * SOFTWARE.
28 *****************************************************************************/
34 /* VLC core API headers */
35 #include <vlc_common.h>
36 #include <vlc_plugin.h>
37 #include <vlc_filter.h>
38 #include <vlc_charset.h>
39 #include <vlc_subpicture.h>
47 static int Create (vlc_object_t
*);
48 static void Destroy(vlc_object_t
*);
49 static int RenderText(filter_t
*,
50 subpicture_region_t
*,
51 subpicture_region_t
*,
52 const vlc_fourcc_t
*);
/*
 * Module descriptor entries.  The lines that open and close the descriptor
 * are not visible in this extract.
 *
 * Visible entries: a translatable description string, the CAT_VIDEO /
 * SUBCAT_VIDEO_SUBPIC category pair, the "text renderer" capability (second
 * argument 0 -- presumably the module score; confirm), the Create/Destroy
 * callback pair, and one integer option named "sapi-voice".
 */
55 set_description(N_("Speech synthesis for Windows"))
57 set_category(CAT_VIDEO
)
58 set_subcategory(SUBCAT_VIDEO_SUBPIC
)
60 set_capability("text renderer", 0)
61 set_callbacks(Create
, Destroy
)
/* "sapi-voice" is the option Create() reads with var_InheritInteger();
 * the second argument (-1) is presumably its default value -- confirm. */
62 add_integer("sapi-voice", -1, "Voice Index", "Voice index", false)
/*
 * Initialises COM on the calling thread with COINIT_MULTITHREADED and, when
 * CoInitializeEx() reports failure, logs the HRESULT on obj via msg_Err().
 *
 * The function is declared to return int, but its braces and return
 * statements are not visible in this extract; the only visible caller,
 * Create(), tests the result for non-zero.
 */
72 static int TryEnterMTA(vlc_object_t
*obj
)
74 HRESULT hr
= CoInitializeEx(NULL
, COINIT_MULTITHREADED
);
75 if (unlikely(FAILED(hr
)))
/* NOTE(review): "%lx" expects an unsigned long, while HRESULT is presumably
 * a signed LONG (Windows SDK) -- confirm and consider an explicit cast. */
77 msg_Err (obj
, "cannot initialize COM (error 0x%lx)", hr
);
/*
 * Function-like macro that deliberately shares the function's name.  It is
 * defined after the function, so later uses of TryEnterMTA(x) in this file
 * expand to a call of the function above with VLC_OBJECT(x) as the argument
 * (a macro is not re-expanded inside its own replacement list).
 */
82 #define TryEnterMTA(o) TryEnterMTA(VLC_OBJECT(o))
/*
 * Variant of TryEnterMTA() that takes no object and returns nothing: it makes
 * the same CoInitializeEx(NULL, COINIT_MULTITHREADED) call and tests the
 * result with FAILED().  The statement executed on failure is not visible in
 * this extract.
 */
84 static void EnterMTA(void)
86 HRESULT hr
= CoInitializeEx(NULL
, COINIT_MULTITHREADED
);
87 if (unlikely(FAILED(hr
)))
/*
 * Only the signature is visible in this extract; the body is missing.
 * NOTE(review): by name this is the counterpart of EnterMTA()/TryEnterMTA()
 * and presumably balances their CoInitializeEx() call -- confirm against the
 * full file.
 */
91 static void LeaveMTA(void)
/*
 * Create: callback registered through set_callbacks(); sets up the filter.
 *
 * Visible steps, in order:
 *   - cast p_this to filter_t and call TryEnterMTA();
 *   - malloc() a filter_sys_t, store it in p_filter->p_sys, and set its
 *     cpVoice and lastString members to NULL;
 *   - create an SpVoice COM object into p_sys->cpVoice;
 *   - enumerate the SPCAT_VOICES tokens, read their count, and read the
 *     "sapi-voice" option; when the index is below the count, fetch that
 *     token and pass it to SetVoice(), logging the outcome;
 *   - call SetOutput(NULL, TRUE) on the voice;
 *   - install RenderText as p_filter->pf_render.
 *
 * This extract is missing lines: the declarations of p_sys, hr and ulCount,
 * the conditions guarding most statements, the braces, and every return and
 * cleanup path.  The comments below describe only what is visible.
 */
96 static int Create (vlc_object_t
*p_this
)
98 filter_t
*p_filter
= (filter_t
*)p_this
;
/* The statement controlled by this test is not visible here. */
102 if (TryEnterMTA(p_this
))
/* NOTE(review): no check of the malloc() result is visible before p_sys is
 * dereferenced below -- verify one exists in the full file. */
105 p_filter
->p_sys
= p_sys
= (filter_sys_t
*) malloc(sizeof(filter_sys_t
));
/* Both members start as NULL; Destroy() later releases/frees them. */
109 p_sys
->cpVoice
= NULL
;
110 p_sys
->lastString
= NULL
;
/* Instantiate the voice object in-process; the interface pointer is written
 * to p_sys->cpVoice. */
112 hr
= CoCreateInstance(CLSID_SpVoice
, NULL
, CLSCTX_INPROC_SERVER
, IID_ISpVoice
, (void**) &p_sys
->cpVoice
);
114 ISpObjectToken
* cpVoiceToken
= NULL
;
115 IEnumSpObjectTokens
* cpEnum
= NULL
;
/* Obtain an enumerator over the voice tokens. */
118 hr
= SpEnumTokens(SPCAT_VOICES
, NULL
, NULL
, &cpEnum
);
121 // Get the number of voices.
122 hr
= cpEnum
->GetCount(&ulCount
);
/* Voice index requested through the "sapi-voice" option. */
125 int voiceIndex
= var_InheritInteger(p_this
, "sapi-voice");
/* The cast makes a negative voiceIndex a large unsigned value, so (assuming
 * ulCount is an unsigned count -- its declaration is not visible) negative
 * indices cannot pass this bound check. */
128 if ((unsigned)voiceIndex
< ulCount
) {
129 hr
= cpEnum
->Item(voiceIndex
, &cpVoiceToken
);
131 hr
= p_sys
->cpVoice
->SetVoice(cpVoiceToken
);
/* The success/failure tests selecting between these two log calls are not
 * visible in this extract. */
133 msg_Dbg(p_this
, "Selected voice %d", voiceIndex
);
136 msg_Err(p_this
, "Failed to set voice %d", voiceIndex
);
/* Drop the reference obtained from Item(). */
138 cpVoiceToken
->Release();
143 msg_Err(p_this
, "Voice index exceeds available count");
/* NOTE(review): no cpEnum->Release() is visible in this extract -- verify
 * the enumerator is released in the full file. */
/* NOTE(review): a NULL output presumably selects the default audio device
 * (see the SAPI ISpVoice::SetOutput documentation) -- confirm. */
149 hr
= p_sys
->cpVoice
->SetOutput(NULL
, TRUE
);
/* Logged when the SpVoice object could not be created; the branch structure
 * leading here is not visible. */
154 msg_Err(p_filter
, "Could not create SpVoice");
/* Hook the renderer into the filter. */
160 p_filter
->pf_render
= RenderText
;
/*
 * Destroy: callback registered through set_callbacks(); tears down the
 * state that Create() stored in p_filter->p_sys.
 *
 * Visible steps: release the COM voice object and free the cached
 * lastString.  Any guard around the Release() call and the freeing of p_sys
 * itself are not visible in this extract.
 */
170 static void Destroy(vlc_object_t
*p_this
)
172 filter_t
*p_filter
= (filter_t
*)p_this
;
173 filter_sys_t
*p_sys
= reinterpret_cast<filter_sys_t
*>( p_filter
->p_sys
);
/* NOTE(review): Create() initialises cpVoice to NULL; verify that a NULL
 * check precedes this call in the full file. */
176 p_sys
->cpVoice
->Release();
/* free(NULL) is a no-op, so lastString needs no guard. */
178 free(p_sys
->lastString
);
182 static int RenderText(filter_t
*p_filter
,
183 subpicture_region_t
*,
184 subpicture_region_t
*p_region_in
,
185 const vlc_fourcc_t
*)
187 filter_sys_t
*p_sys
= reinterpret_cast<filter_sys_t
*>( p_filter
->p_sys
);
188 text_segment_t
*p_segment
= p_region_in
->p_text
;
193 for (const text_segment_t
*s
= p_segment
; s
!= NULL
; s
= s
->p_next
) {
197 if (strlen(s
->psz_text
) == 0)
200 if (p_sys
->lastString
&& !strcmp(p_sys
->lastString
, s
->psz_text
))
203 if (!strcmp(s
->psz_text
, "\n"))
207 free(p_sys
->lastString
);
208 p_sys
->lastString
= strdup(s
->psz_text
);
211 if (p_sys
->lastString
) {
212 msg_Dbg(p_filter
, "Speaking '%s'", s
->psz_text
);
215 wchar_t* wideText
= ToWide(s
->psz_text
);
216 HRESULT hr
= p_sys
->cpVoice
->Speak(wideText
, SPF_ASYNC
, NULL
);
218 if (!SUCCEEDED(hr
)) {
219 msg_Err(p_filter
, "Speak() error");