"Fossies" - the Fresh Open Source Software Archive 
1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 2002 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* */
34 /* Author: Rob Clark (robert@cstr.ed.ac.uk) */
35 /* -------------------------------------------------------------------- */
36 /* Code to read APML format XML as utterances. */
37 /* */
38 /*************************************************************************/
39
40 #include <cstdlib>
41 #include <cstdio>
42 #include "EST_THash.h"
43 #include "EST_error.h"
44 #include "apml.h"
45 #include "rxp/XML_Parser.h"
46
47 static EST_Regex simpleIDRegex(".*#id(w\\([0-9]+\\))");
48 static EST_Regex rangeIDRegex(".*#id(w\\([0-9]+\\)).*id(w\\([0-9]+\\))");
49 static EST_Regex RXpunc("[\\.,\\?\\!\"]+");
50
51 class Parse_State
52 {
53 public:
54 int depth;
55 int maxid;
56 EST_Utterance *utt;
57 EST_Relation *tokens;
58 EST_Relation *perf;
59 EST_Relation *com;
60 EST_Relation *semstruct;
61 EST_Relation *emphasis;
62 EST_Relation *boundary;
63 EST_Relation *pause;
64 EST_Item *parent;
65 EST_Item *pending;
66 EST_Item *last_token;
67 };
68
69 class Apml_Parser_Class : public XML_Parser_Class
70 {
71 protected:
72 virtual void document_open(XML_Parser_Class &c,
73 XML_Parser &p,
74 void *data);
75 virtual void document_close(XML_Parser_Class &c,
76 XML_Parser &p,
77 void *data);
78
79 virtual void element_open(XML_Parser_Class &c,
80 XML_Parser &p,
81 void *data,
82 const char *name,
83 XML_Attribute_List &attributes);
84 virtual void element(XML_Parser_Class &c,
85 XML_Parser &p,
86 void *data,
87 const char *name,
88 XML_Attribute_List &attributes);
89 virtual void element_close(XML_Parser_Class &c,
90 XML_Parser &p,
91 void *data,
92 const char *name);
93
94 virtual void pcdata(XML_Parser_Class &c,
95 XML_Parser &p,
96 void *data,
97 const char *chars);
98 virtual void cdata(XML_Parser_Class &c,
99 XML_Parser &p,
100 void *data,
101 const char *chars);
102
103 virtual void processing(XML_Parser_Class &c,
104 XML_Parser &p,
105 void *data,
106 const char *instruction);
107 virtual void error(XML_Parser_Class &c,
108 XML_Parser &p,
109 void *data);
110 };
111
112 static void print_attributes(XML_Attribute_List &attributes)
113 {
114 XML_Attribute_List::Entries them;
115
116 for(them.begin(attributes); them ; them++)
117 printf(" %s='%s'",
118 (const char *)them->k,
119 (const char *)them->v);
120 }
121
122 EST_read_status apml_read(FILE *file,
123 const EST_String &name,
124 EST_Utterance &u,
125 int &max_id)
126 {
127 (void)max_id;
128 (void)print_attributes; // just to shut -Wall up.
129 Apml_Parser_Class pclass;
130 Parse_State state;
131
132 u.clear();
133
134 state.utt=&u;
135
136 XML_Parser *parser = pclass.make_parser(file, name, &state);
137 parser->track_context(TRUE);
138
139 CATCH_ERRORS()
140 return read_format_error;
141
142 parser->go();
143
144 END_CATCH_ERRORS();
145
146 return read_ok;
147 }
148
149
150
151 /** Now we define the callbacks.
152 */
153
154 void Apml_Parser_Class::document_open(XML_Parser_Class &c,
155 XML_Parser &p,
156 void *data)
157 {
158 (void)c; (void)p;
159 Parse_State *state = (Parse_State *)data;
160
161 state->maxid=0;
162
163 state->depth=1;
164 state->parent=NULL;
165 state->pending=NULL;
166 state->last_token=NULL;
167
168 // create relations:
169 state->perf = state->utt->create_relation("Perfomative");
170 state->com = state->utt->create_relation("Communicative");
171 state->tokens = state->utt->create_relation("Token");
172 state->semstruct = state->utt->create_relation("SemStructure");
173 state->emphasis = state->utt->create_relation("Emphasis");
174 state->boundary = state->utt->create_relation("Boundary");
175 state->pause = state->utt->create_relation("Pause");
176
177
178 }
179
180 void Apml_Parser_Class::document_close(XML_Parser_Class &c,
181 XML_Parser &p,
182 void *data)
183 {
184 (void)c; (void)p; (void)data;
185 }
186
187
188 void Apml_Parser_Class::element_open(XML_Parser_Class &c,
189 XML_Parser &p,
190 void *data,
191 const char *name,
192 XML_Attribute_List &attributes)
193 {
194 (void)c; (void)p; (void)attributes;
195 Parse_State *state = (Parse_State *)data;
196
197 //cout << " In element_open: " << name << "\n";
198
199 if (strcmp(name, "turnallocation")==0)
200 {
201 // currently ignore
202 return;
203 }
204
205 if (strcmp(name, "apml")==0)
206 return; // ignore
207
208 state->depth++;
209
210 if( strcmp(name, "performative")==0
211 || strcmp(name, "rheme")==0
212 || strcmp(name, "theme")==0
213 || strcmp(name, "emphasis")==0
214 || strcmp(name, "boundary")==0
215 || strcmp(name, "pause")==0)
216 {
217
218 // create new item content
219 EST_Item_Content *cont = new EST_Item_Content();
220 cont->set_name(name);
221
222 XML_Attribute_List::Entries them;
223 for(them.begin(attributes); them ; them++)
224 {
225 EST_String k = them->k;
226 EST_String v = them->v;
227 cont->f.set(k,v);
228 }
229
230 EST_Item *item;
231
232 if( strcmp(name, "emphasis")==0 )
233 {
234 item = state->emphasis->append();
235 state->pending = item;
236 }
237 else if(strcmp(name, "boundary")==0 )
238 {
239 item = state->boundary->append();
240 if(state->last_token)
241 item->append_daughter(state->last_token);
242 }
243 else if(strcmp(name, "pause")==0 )
244 {
245 item = state->pause->append();
246 if(state->last_token)
247 item->append_daughter(state->last_token);
248 }
249 else
250 {
251 if (state->parent == NULL)
252 item = state->semstruct->append();
253 else
254 item = state->parent->append_daughter();
255 state->parent=item;
256 }
257
258 item->set_contents(cont);
259
260
261 }
262 else
263 EST_warning("APML Parser: unknown element %s", name);
264 }
265
266
267 void Apml_Parser_Class::element(XML_Parser_Class &c,
268 XML_Parser &p,
269 void *data,
270 const char *name,
271 XML_Attribute_List &attributes)
272 {
273 (void)c; (void)p; (void)attributes;
274
275 element_open(c, p, data, name, attributes);
276 element_close(c, p, data, name);
277 }
278
279
280 void Apml_Parser_Class::element_close(XML_Parser_Class &c,
281 XML_Parser &p,
282 void *data,
283 const char *name)
284 {
285 (void)c; (void)p; (void)name;
286 Parse_State *state = (Parse_State *)data;
287
288 if ( strcmp(name, "emphasis")==0
289 || strcmp(name, "boundary")==0
290 || strcmp(name, "pause")==0 )
291 {
292 state->depth--;
293 state->pending=NULL;
294 }
295
296
297 if (strcmp(name, "performative")==0
298 || strcmp(name, "theme")==0
299 || strcmp(name, "rheme")==0)
300 {
301 state->depth--;
302 state->pending = NULL;
303 state->parent=iup(state->parent);
304 }
305 }
306
307
308 void Apml_Parser_Class::pcdata(XML_Parser_Class &c,
309 XML_Parser &p,
310 void *data,
311 const char *chars)
312 {
313 (void)c;
314
315 Parse_State *state = (Parse_State *)data;
316 EST_String strings[255];
317
318 split(chars,strings,255,RXwhite);
319
320 // for(int cc=0 ; cc < 20 ; ++cc)
321 // cout << cc << ": \"" << strings[cc] << "\" (" << strings[cc].length() << ")\n";
322
323 int s=0;
324
325 while( s < 1 || strings[s].length() > 0 )
326 {
327 if(strings[s].length() > 0 )
328 {
329 // Just Punctuation
330 if(strings[s].matches(RXpunc))
331 {
332 state->last_token->set("punc",strings[s]);
333 }
334 // Text and possibly punc
335 else
336 {
337 EST_Item_Content *cont = new EST_Item_Content();
338 EST_Item *item;
339
340 if (state->parent == NULL)
341 item = state->semstruct->append();
342 else
343 item = state->parent->append_daughter();
344 item->set_contents(cont);
345
346 // strip pre-punc here.
347 int i = strings[s].index(RXpunc);
348 EST_String ps = strings[s].at(RXpunc);
349 EST_String intermediate;
350 if( ps.length() > 0 && i == 0)
351 {
352 cout << "Got pre punc: " << ps << endl;
353 intermediate = strings[s].after(RXpunc);
354 // cont->set_name(strings[s].before(RXpunc));
355 item->set("prepunctuation",ps);
356 }
357 else
358 {
359 intermediate = strings[s];
360 item->set("prepunctuation","");
361 }
362 // now strip punc
363 ps = intermediate.at(RXpunc);
364 if( ps.length() > 0 )
365 {
366 cout << "Got punc: " << ps << endl;
367 cont->set_name(intermediate.before(RXpunc));
368 item->set("punc",ps);
369 }
370 else
371 {
372 cont->set_name(intermediate);
373 item->set("punc","");
374 }
375
376 state->tokens->append(item);
377 state->last_token = item;
378
379 if(state->pending)
380 {
381 state->pending->append_daughter(item);
382 }
383
384 // if (state->parent != NULL && p.context(0) == "w")
385 // state->parent->set(EST_String("token"), chars);
386
387 //cout << " got token: " << item->name() << "\n";
388 }
389 }
390 ++s;
391 }
392 }
393
394
395 void Apml_Parser_Class::cdata(XML_Parser_Class &c,
396 XML_Parser &p,
397 void *data,
398 const char *chars)
399 {
400 (void)c; (void)p; (void)data; (void)chars;
401 // Parse_State *state = (Parse_State *)data;
402
403 // printf("APML XML Parser [cdata[%s]] %d\n", chars, state->depth);
404 }
405
406
407 void Apml_Parser_Class::processing(XML_Parser_Class &c,
408 XML_Parser &p,
409 void *data,
410 const char *instruction)
411 {
412 (void)c; (void)p;
413 Parse_State *state = (Parse_State *)data;
414
415 printf("APML XML Parser [proc[%s]] %d\n", instruction, state->depth);
416 }
417
418
419 void Apml_Parser_Class::error(XML_Parser_Class &c,
420 XML_Parser &p,
421 void *data)
422 {
423 (void)c; (void)p; (void)data;
424 // Parse_State *state = (Parse_State *)data;
425
426 EST_error("APML Parser %s", get_error(p));
427
428 est_error_throw();
429 }
430
431
432
433
434
435
436