include/espeak/src/translate.h

   1 /***************************************************************************
   2  *   Copyright (C) 2005 to 2013 by Jonathan Duddington                     *
   3  *   email: jonsd@users.sourceforge.net                                    *
   4  *                                                                         *
   5  *   This program is free software; you can redistribute it and/or modify  *
   6  *   it under the terms of the GNU General Public License as published by  *
   7  *   the Free Software Foundation; either version 3 of the License, or     *
   8  *   (at your option) any later version.                                   *
   9  *                                                                         *
  10  *   This program is distributed in the hope that it will be useful,       *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  13  *   GNU General Public License for more details.                          *
  14  *                                                                         *
  15  *   You should have received a copy of the GNU General Public License     *
  16  *   along with this program; if not, see:                                 *
  17  *               <http://www.gnu.org/licenses/>.                           *
  18  ***************************************************************************/
  19
  20
  #define L(c1,c2)  (((c1)<<8)+(c2))    // combine two characters into an integer for translator name; arguments and result are parenthesized so the macro is safe inside larger expressions
  22
  23 #define CTRL_EMBEDDED    0x01         // control character at the start of an embedded command
  24 #define REPLACED_E       'E'          // 'e' replaced by silent e
  25
  26 #define N_WORD_PHONEMES  200          // max phonemes in a word
  27 #define N_WORD_BYTES     160          // max bytes for the UTF8 characters in a word
  28 #define N_CLAUSE_WORDS   300          // max words in a clause
  29 #define N_RULE_GROUP2    120          // max num of two-letter rule chains
  30 #define N_HASH_DICT     1024
  31 #define N_CHARSETS        20
  32 #define N_LETTER_GROUPS   95          // maximum is 127-32
  33
  34
  35 /* dictionary flags, word 1 */
  36 // bits 0-3  stressed syllable,  bit 6=unstressed
  37 #define FLAG_SKIPWORDS        0x80
  38 #define FLAG_PREPAUSE        0x100
  39
  40 #define FLAG_STRESS_END      0x200  // full stress if at end of clause
  41 #define FLAG_STRESS_END2     0x400  // full stress if at end of clause, or only followed by unstressed
  42 #define FLAG_UNSTRESS_END    0x800  // reduce stress at end of clause
  43 #define FLAG_SPELLWORD      0x1000  // re-translate the word as individual letters, separated by spaces
  44 #define FLAG_ABBREV         0x2000  // spell as letters, even with a vowel, OR use specified pronunciation rather than split into letters
  45 #define FLAG_DOUBLING       0x4000  // doubles the following consonant
  46
  47 #define BITNUM_FLAG_ALT         14  // bit number of FLAG_ALT_TRANS - 1
  48 #define FLAG_ALT_TRANS      0x8000  // language specific
  49 #define FLAG_ALT2_TRANS    0x10000  // language specific
  50 #define FLAG_ALT3_TRANS    0x20000  // language specific
  51 #define FLAG_ALT4_TRANS    0x40000  // language specific
  52 #define FLAG_ALT5_TRANS    0x80000  // language specific
  53 #define FLAG_ALT6_TRANS   0x100000  // language specific
  54
  55 #define FLAG_COMBINE      0x800000  // combine with the next word
  56 #define FLAG_ALLOW_DOT  0x01000000  // ignore '.' after word (abbreviation)
  57 #define FLAG_NEEDS_DOT  0x02000000  // only if the word is followed by a dot
  58 #define FLAG_WAS_UNPRONOUNCABLE  0x04000000  // the unpronounceable routine was used
  59 #define FLAG_MAX3       0x08000000  // limit to 3 repeats
  60 #define FLAG_PAUSE1     0x10000000  // shorter prepause
  61 #define FLAG_TEXTMODE   0x20000000  // word translates to replacement text, not phonemes
  62 #define BITNUM_FLAG_TEXTMODE    29
  63
  64 #define FLAG_FOUND_ATTRIBUTES     0x40000000  // word was found in the dictionary list (has attributes)
  65 #define FLAG_FOUND      0x80000000  // pronunciation was found in the dictionary list
  66
  67 // dictionary flags, word 2
  68 #define FLAG_VERBF             0x1  /* verb follows */
  69 #define FLAG_VERBSF            0x2  /* verb follows, may have -s suffix */
  70 #define FLAG_NOUNF             0x4  /* noun follows */
  71 #define FLAG_PASTF             0x8  /* past tense follows */
  72 #define FLAG_VERB             0x10  /* pronunciation for verb */
  73 #define FLAG_NOUN             0x20  /* pronunciation for noun */
  74 #define FLAG_PAST             0x40  /* pronunciation for past tense */
  75 #define FLAG_VERB_EXT        0x100  /* extend the 'verb follows' */
  76 #define FLAG_CAPITAL         0x200  /* pronunciation if initial letter is upper case */
  77 #define FLAG_ALLCAPS         0x400  // only if the word is all capitals
  78 #define FLAG_ACCENT          0x800  // character name is base-character name + accent name
  79 #define FLAG_HYPHENATED     0x1000  // multiple-words, but needs hyphen between parts 1 and 2
  80 #define FLAG_SENTENCE       0x2000  // only if the clause is a sentence
  81 #define FLAG_ONLY           0x4000
  82 #define FLAG_ONLY_S         0x8000
  83 #define FLAG_STEM          0x10000  // must have a suffix
  84 #define FLAG_ATEND         0x20000  // use this pronunciation if at end of clause
  85 #define FLAG_ATSTART       0x40000  // use this pronunciation if at start of clause
  86 #define FLAG_NATIVE        0x80000  // not if we've switched translators
  87 #define FLAG_LOOKUP_SYMBOL 0x40000000  // to indicate called from Lookup()
  88
  89 #define BITNUM_FLAG_ALLCAPS   0x2a
  90 #define BITNUM_FLAG_HYPHENATED  0x2c
  91 #define BITNUM_FLAG_ONLY      0x2e
  92 #define BITNUM_FLAG_ONLY_S    0x2f
  93
  94
  95 // wordflags, flags in source word
  96 #define FLAG_ALL_UPPER     0x1    /* no lower case letters in the word */
  97 #define FLAG_FIRST_UPPER   0x2    /* first letter is upper case */
  98 #define FLAG_UPPERS        0x3    // FLAG_ALL_UPPER | FLAG_FIRST_UPPER
  99 #define FLAG_HAS_PLURAL    0x4    /* upper-case word with s or 's lower-case ending */
 100 #define FLAG_PHONEMES      0x8    /* word is phonemes */
 101 #define FLAG_LAST_WORD     0x10   /* last word in clause */
 102 #define FLAG_EMBEDDED      0x40   /* word is preceded by embedded commands */
 103 #define FLAG_HYPHEN        0x80
 104 #define FLAG_NOSPACE       0x100  // word is not separated from previous word by a space
 105 #define FLAG_FIRST_WORD    0x200  // first word in clause
 106 #define FLAG_FOCUS         0x400   // the focus word of a clause
 107 #define FLAG_EMPHASIZED    0x800
 108 #define FLAG_EMPHASIZED2   0xc00  // FLAG_FOCUS | FLAG_EMPHASIZED
 109 #define FLAG_DONT_SWITCH_TRANSLATOR  0x1000
 110 #define FLAG_SUFFIX_REMOVED  0x2000
 111 #define FLAG_HYPHEN_AFTER    0x4000
 112 #define FLAG_ORDINAL       0x8000   // passed to TranslateNumber() to indicate an ordinal number
 113 #define FLAG_HAS_DOT       0x10000  // dot after this word
 114 #define FLAG_COMMA_AFTER   0x20000  // comma after this word
 115 #define FLAG_MULTIPLE_SPACES 0x40000  // word is preceded by multiple spaces, newline, or tab
 116 #define FLAG_INDIVIDUAL_DIGITS 0x80000  // speak number as individual digits
 117 #define FLAG_DELETE_WORD     0x100000   // don't speak this word, it has been spoken as part of the previous word
 118 #define FLAG_CHAR_REPLACED   0x200000   // characters have been replaced by .replace in the *_rules
 119 #define FLAG_TRANSLATOR2     0x400000   // retranslating using a different language
 120
 121 #define FLAG_SUFFIX_VOWEL  0x08000000   // remember an initial vowel from the suffix
 122 #define FLAG_NO_TRACE      0x10000000   // passed to TranslateRules() to suppress dictionary lookup printout
 123 #define FLAG_NO_PREFIX     0x20000000
 124 #define FLAG_UNPRON_TEST   0x80000000   // do unpronounceability test on the beginning of the word
 125
 126
 127 // prefix/suffix flags (bits 8 to 14, bits 16 to 22) don't use 0x8000, 0x800000
 128 #define SUFX_E        0x0100   // e may have been added
 129 #define SUFX_I        0x0200   // y may have been changed to i
 130 #define SUFX_P        0x0400   // prefix
 131 #define SUFX_V        0x0800   // suffix means use the verb form pronunciation
 132 #define SUFX_D        0x1000   // previous letter may have been doubled
 133 #define SUFX_F        0x2000   // verb follows
 134 #define SUFX_Q        0x4000   // don't retranslate
 135 #define SUFX_T        0x10000   // don't affect the stress position in the stem
 136 #define SUFX_B        0x20000  // break, this character breaks the word into stem and suffix (used with SUFX_P)
 137 #define SUFX_A        0x40000  // remember that the suffix starts with a vowel
 138 #define SUFX_M        0x80000  // bit 19, allow multiple suffixes
 139
 140 #define SUFX_UNPRON     0x8000   // used to return $unpron flag from *_rules
 141
 142
 143 #define FLAG_ALLOW_TEXTMODE  0x02  // allow dictionary to translate to text rather than phonemes
 144 #define FLAG_SUFX       0x04
 145 #define FLAG_SUFX_S     0x08
 146 #define FLAG_SUFX_E_ADDED 0x10
 147
 148
 149 // codes in dictionary rules
 150 #define RULE_PRE                        1
 151 #define RULE_POST                       2
 152 #define RULE_PHONEMES   3
 153 #define RULE_PH_COMMON  4       // At start of rule. Its phoneme string is used by subsequent rules
 154 #define RULE_CONDITION  5       // followed by condition number (byte)
 155 #define RULE_GROUP_START 6
 156 #define RULE_GROUP_END  7
 157 #define RULE_PRE_ATSTART 8   // as RULE_PRE but also match with 'start of word'
 158 #define RULE_LINENUM            9  // next 2 bytes give a line number, for debugging purposes
 159
 160 #define RULE_SPACE              32   // ascii space
 161 #define RULE_SYLLABLE   21    // @
 162 #define RULE_STRESSED   10   // &
 163 #define RULE_DOUBLE             11   // %
 164 #define RULE_INC_SCORE  12   // +
 165 #define RULE_DEL_FWD            13   // #
 166 #define RULE_ENDING             14   // S
 167 #define RULE_DIGIT              15   // D digit
 168 #define RULE_NONALPHA   16   // Z non-alpha
 169 #define RULE_LETTERGP   17   // A B C H F G Y   letter group number
 170 #define RULE_LETTERGP2  18   // L + letter group number
 171 #define RULE_CAPITAL    19   // !   word starts with a capital letter
 172 #define RULE_REPLACEMENTS 20  // section for character replacements
 173 #define RULE_SKIPCHARS  23   // J
 174 #define RULE_NO_SUFFIX  24   // N
 175 #define RULE_NOTVOWEL   25   // K
 176 #define RULE_IFVERB     26   // V
 177 #define RULE_DOLLAR     28   // $ commands
 178 #define RULE_NOVOWELS   29   // X no vowels up to word boundary
 179 #define RULE_SPELLING   31   // W while spelling letter-by-letter
 180 #define RULE_LAST_RULE   31
 181
 182 #define LETTERGP_A      0
 183 #define LETTERGP_B      1
 184 #define LETTERGP_C      2
 185 #define LETTERGP_H      3
 186 #define LETTERGP_F      4
 187 #define LETTERGP_G      5
 188 #define LETTERGP_Y      6
 189 #define LETTERGP_VOWEL2   7
 190
 191
 192 // Punctuation types  returned by ReadClause()
 193 // bits 0-7 pause x 10mS,
 194 // bits 12-14 intonation type
 195 // bit 15= don't need space after the punctuation
 196 // bit 19=sentence, bit 18=clause,  bit 17=voice change
 197 // bit 16 used to distinguish otherwise identical types
 198 // bit 20= punctuation character can be inside a word (Armenian)
 199 // bit 21= speak the name of the punctuation character
 200 // bit 22= dot after the last word
 201 #define CLAUSE_BIT_SENTENCE  0x80000
 202 #define CLAUSE_BIT_CLAUSE    0x40000
 203 #define CLAUSE_BIT_VOICE     0x20000
 204 #define CLAUSE_BITS_INTONATION 0x7000
 205 #define PUNCT_IN_WORD        0x100000
 206 #define PUNCT_SAY_NAME       0x200000
 207 #define CLAUSE_DOT           0x400000
 208
 209 #define CLAUSE_NONE        ( 0 + 0x04000)
 210 #define CLAUSE_PARAGRAPH   (70 + 0x80000)
 211 #define CLAUSE_EOF         (40 + 0x90000)
 212 #define CLAUSE_VOICE       ( 0 + 0x24000)
 213 #define CLAUSE_PERIOD      (40 + 0x80000)
 214 #define CLAUSE_COMMA       (20 + 0x41000)
 215 #define CLAUSE_SHORTCOMMA  ( 4 + 0x41000)
 216 #define CLAUSE_SHORTFALL   ( 4 + 0x40000)
 217 #define CLAUSE_QUESTION    (40 + 0x82000)
 218 #define CLAUSE_EXCLAMATION (45 + 0x83000)
 219 #define CLAUSE_COLON       (30 + 0x40000)
 220 #define CLAUSE_SEMICOLON   (30 + 0x41000)
 221
 222 #define SAYAS_CHARS     0x12
 223 #define SAYAS_GLYPHS    0x13
 224 #define SAYAS_SINGLE_CHARS 0x14
 225 #define SAYAS_KEY       0x24
 226 #define SAYAS_DIGITS    0x40  // + number of digits
 227 #define SAYAS_DIGITS1   0xc1
 228
 229 #define CHAR_EMPHASIS   0x0530  // this is an unused character code
 230 #define CHAR_COMMA_BREAK  0x0557  // unused character code
 231
 232 // Rule:
 233 // [4] [match] [1 pre] [2 post] [3 phonemes] 0
 234 //     match 1 pre 2 post 0     - use common phoneme string
 235 //     match 1 pre 2 post 3 0   - empty phoneme string
 236
 237 typedef const char *  constcharptr;
 238
 239 typedef struct {   // NOTE(review): presumably one candidate result of matching a pronunciation rule - confirm
 240         int  points;   // NOTE(review): presumably the score of the match (cf. RULE_INC_SCORE) - confirm
 241         const char *phonemes;
 242         int  end_type;
 243         char *del_fwd;   // NOTE(review): name suggests a link to RULE_DEL_FWD - confirm
 244 } MatchRecord;
 245
 246
 247 // used to mark words with the source[] buffer
 248 typedef struct{
 249         unsigned int flags;   // NOTE(review): presumably the "wordflags" FLAG_* bits (FLAG_ALL_UPPER etc.) - confirm
 250         unsigned short start;
 251         unsigned char pre_pause;
 252         unsigned char wmark;
 253         unsigned short sourceix;   // NOTE(review): presumably an index into source[] - confirm
 254         unsigned char length;   // unsigned char, so the value can be at most UCHAR_MAX
 255 } WORD_TAB;
 256
 257
 258 typedef struct {   // element type of the param_stack[] array declared below
 259         int type;
 260         int parameter[N_SPEECH_PARAM];   // N_SPEECH_PARAM is not defined in the visible part of this header; it must already be defined when this header is processed
 261 } PARAM_STACK;
 262
 263 extern PARAM_STACK param_stack[];
 264 extern const int param_defaults[N_SPEECH_PARAM];
 265
 266
 267 typedef struct {   // element type of alphabets[]; current_alphabet is a pointer to this type
 268     const char *name;
 269     int offset;
 270     unsigned short range_min, range_max;
 271     int language;
 272     int flags;   // NOTE(review): presumably a combination of the AL_* "alphabet flags" - confirm
 273 } ALPHABET;
 274
 275 extern ALPHABET alphabets[];
 276 extern ALPHABET *current_alphabet;
 277 // alphabet flags
 278 #define AL_DONT_NAME  0x01    // don't speak the alphabet name
 279 #define AL_NOT_LETTERS  0x02  // don't use the language for speaking letters
 280 #define AL_WORDS      0x04    // use the language to speak words
 281 #define AL_NOT_CODE   0x08    // don't speak the character code
 282 #define AL_NO_SYMBOL  0x10    // don't repeat "symbol" or "character"
 283
 284
 285 #define N_LOPTS      21
 286 #define LOPT_DIERESES        1
 287  // 1=remove [:] from unstressed syllables, 2= remove from unstressed or non-penultimate syllables
 288  // bit 4=0, if stress < 4,  bit 4=1, if not the highest stress in the word
 289 #define LOPT_IT_LENGTHEN        2
 290
 291  // 1=german
 292 #define LOPT_PREFIXES        3
 293
 294  // non-zero, change voiced/unvoiced to match last consonant in a cluster
 295  // bit 0=use regressive voicing
 296  // bit 1=LANG=cz,bg  don't propagate over [v]
 297  // bit 2=don't propagate across word boundaries
 298  // bit 3=LANG=pl,  propagate over liquids and nasals
 299  // bit 4=LANG=cz,sk  don't propagate to [v]
 300  // bit 8=devoice word-final consonants
 301 #define LOPT_REGRESSIVE_VOICING  4
 302
 303  // 0=default, 1=no check, other allow this character as an extra initial letter (default is 's')
 304 #define LOPT_UNPRONOUNCABLE  5
 305
 306  // select length_mods tables,  (length_mod_tab) + (length_mod_tab0 * 100)
 307 #define LOPT_LENGTH_MODS    6
 308
 309  // increase this to prevent sonorants being shortened before shortened (eg. unstressed) vowels
 310 #define LOPT_SONORANT_MIN    7
 311
 312  // bit 0: don't break vowels at word boundary
 313 #define LOPT_WORD_MERGE      8
 314
 315  // max. amplitude for vowel at the end of a clause
 316 #define LOPT_MAXAMP_EOC      9
 317
 318  // bit 0=reduce even if phonemes are specified in the **_list file
 319  // bit 1=don't reduce the strongest vowel in a word which is marked 'unstressed'
 320 #define LOPT_REDUCE  10
 321
 322  // LANG=cs,sk  combine some prepositions with the following word, if the combination has N or fewer syllables
 323  // bits 0-3  N syllables
 324  // bit 4=only if the second word has $alt attribute
 325  // bit 5=not if the second word is end-of-sentence
 326 #define LOPT_COMBINE_WORDS 11
 327
 328  // change [t] when followed by unstressed vowel
 329 #define LOPT_REDUCE_T 12
 330
 331  // 1 = allow capitals inside a word
 332  // 2 = stressed syllable is indicated by capitals
 333 #define LOPT_CAPS_IN_WORD  13
 334
 335  // bit 0=Italian "syntactic doubling" of consonants in the word after a word marked with $double attribute
 336  // bit 1=also after a word which ends with a stressed vowel
 337 #define LOPT_IT_DOUBLING    14
 338
 339   // Call ApplySpecialAttributes() if $alt or $alt2 is set for a word
 340   // bit 1: stressed syllable: $alt change [e],[o] to [E],[O],  $alt2 change [E],[O] to [e],[o]
 341 #define LOPT_ALT  15
 342
 343   // pause for bracket (default=4), pause when announcing bracket names (default=2)
 344 #define LOPT_BRACKET_PAUSE 16
 345
 346         // bit 1, don't break clause before announcing . ? !
 347 #define LOPT_ANNOUNCE_PUNCT 17
 348
 349         // recognize long vowels (0 = don't recognize)
 350 #define LOPT_LONG_VOWEL_THRESHOLD 18
 351
 352         // bit 0:  Don't allow suffixes if there is no previous syllable
 353 #define LOPT_SUFFIX  19
 354
 355         // bit 0  Apostrophe at start of word is part of the word
 356         // bit 1  Apostrophe at end of word is part of the word
 357 #define LOPT_APOSTROPHE  20
 358
 359
 360 // stress_rule
 361 #define STRESSPOSN_1L   0       // 1st syllable
 362 #define STRESSPOSN_2L   1       // 2nd syllable
 363 #define STRESSPOSN_2R   2       // penultimate
 364 #define STRESSPOSN_1R   3       // final syllable
 365 #define STRESSPOSN_3R   4       // antepenultimate
 366
 367
 368 typedef struct {   // per-language options; embedded in Translator as the 'langopts' member
 369 // bits0-2  separate words with (1=pause_vshort, 2=pause_short, 3=pause, 4=pause_long 5=[?] phoneme)
 370 // bit 3=don't use linking phoneme
 371 // bit4=longer pause before STOP, VSTOP,FRIC
 372 // bit5=length of a final vowel doesn't depend on the next phoneme
 373         int word_gap;
 374         int vowel_pause;
 375         int stress_rule; // one of the STRESSPOSN_* values: 0=1st syllable, 1=2nd, 2=penultimate, 3=final, 4=antepenultimate
 376
 377 #define S_NO_DIM            0x02
 378 #define S_FINAL_DIM         0x04
 379 #define S_FINAL_DIM_ONLY    0x06
 380 // bit1=don't set diminished stress,
 381 // bit2=mark unstressed final syllables as diminished
 382
 383 // bit3=set consecutive unstressed syllables in unstressed words to diminished, but not in stressed words
 384
 385 #define S_FINAL_NO_2        0x10
 386 // bit4=don't allow secondary stress on last syllable
 387
 388 #define S_NO_AUTO_2         0x20
 389 // bit5=don't use automatic secondary stress
 390
 391 #define S_2_TO_HEAVY        0x40
 392 // bit6=light syllable followed by heavy, move secondary stress to the heavy syllable. LANG=Finnish
 393
 394 #define S_FIRST_PRIMARY     0x80
 395 // bit7=if more than one primary stress, make the subsequent primaries to secondary stress
 396
 397 #define S_FINAL_STRESS_C    0x100
 398 // bit8=stress last syllable if it doesn't end in a vowel
 399
 400 #define S_FINAL_SPANISH     0x200
 401 // bit9=stress last syllable if it doesn't end in vowel or "s" or "n"  LANG=Spanish
 402
 403 #define S_2_SYL_2           0x1000
 404 // bit12= In a 2-syllable word, if one has primary stress then give the other secondary stress
 405
 406 #define S_INITIAL_2         0x2000
 407 // bit13= If there is only one syllable before the primary stress, give it a secondary stress
 408
 409 #define S_MID_DIM           0x10000
 410 // bit 16= Set (not first or last) syllables to diminished stress
 411
 412 #define S_PRIORITY_STRESS   0x20000
 413 // bit17= "priority" stress reduces other primary stress to "unstressed" not "secondary"
 414
 415 #define S_EO_CLAUSE1        0x40000
 416 // bit18= don't lengthen short vowels more than long vowels at end-of-clause
 417
 418 #define S_FINAL_LONG         0x80000
 419 // bit19=stress on final syllable if it has a long vowel, but previous syllable has a short vowel
 420
 421
 422 #define S_HYPEN_UNSTRESS    0x100000
 423 // bit20= hyphenated words, 2nd part is unstressed
 424
 425 #define S_NO_EOC_LENGTHEN   0x200000
 426 // bit21= don't lengthen vowels at end-of-clause
 427
 428 // bit15= Give stress to the first unstressed syllable
 429
 430
 431         int stress_flags;   // NOTE(review): presumably a combination of the S_* bits listed above - confirm
 432         int unstressed_wd1; // stress for $u word of 1 syllable
 433         int unstressed_wd2; // stress for $u word of >1 syllable
 434         int param[N_LOPTS];   // NOTE(review): presumably indexed by the LOPT_* constants - confirm
 435         int param2[N_LOPTS];
 436         unsigned char *length_mods;
 437         unsigned char *length_mods0;
 438
 439 #define NUM_THOUS_SPACE  0x4
 440 #define NUM_DECIMAL_COMMA 0x8
 441 #define NUM_SWAP_TENS    0x10
 442 #define NUM_AND_UNITS    0x20
 443 #define NUM_HUNDRED_AND  0x40
 444 #define NUM_SINGLE_AND   0x80
 445 #define NUM_SINGLE_STRESS 0x100
 446 #define NUM_SINGLE_VOWEL 0x200
 447 #define NUM_OMIT_1_HUNDRED 0x400
 448 #define NUM_1900         0x800
 449 #define NUM_ALLOW_SPACE  0x1000
 450 #define NUM_DFRACTION_1  0x2000
 451 #define NUM_DFRACTION_2  0x4000
 452 #define NUM_DFRACTION_3  0x6000
 453 #define NUM_DFRACTION_4  0x8000
 454 #define NUM_DFRACTION_5  0xa000
 455 #define NUM_DFRACTION_6  0xc000
 456 #define NUM_DFRACTION_7  0xe000    // lang=si, alternative form of number for decimal fraction digits (except the last)
 457 #define NUM_ORDINAL_DOT   0x10000
 458 #define NUM_NOPAUSE       0x20000
 459 #define NUM_AND_HUNDRED   0x40000
 460 #define NUM_THOUSAND_AND  0x80000
 461 #define NUM_VIGESIMAL       0x100000
 462 #define NUM_OMIT_1_THOUSAND 0x200000
 463 #define NUM_ZERO_HUNDRED    0x400000
 464 #define NUM_HUNDRED_AND_DIGIT   0x800000
 465 #define NUM_ROMAN          0x1000000
 466 #define NUM_ROMAN_CAPITALS 0x2000000
 467 #define NUM_ROMAN_AFTER    0x4000000
 468 #define NUM_ROMAN_ORDINAL  0x8000000
 469 #define NUM_SINGLE_STRESS_L  0x10000000
 470
 471         // bits0-1=which numbers routine to use.
 472         // bit2=  thousands separator must be space
 473         // bit3=  , decimal separator, not .
 474         // bit4=use three-and-twenty rather than twenty-three
 475         // bit5='and' between tens and units
 476         // bit6=add "and" after hundred or thousand
 477         // bit7=don't have "and" both after hundreds and also between tens and units
 478    // bit8=only one primary stress in tens+units
 479         // bit9=only one vowel between tens and units
 480         // bit10=omit "one" before "hundred"
 481         // bit11=say 19** as nineteen hundred
 482         // bit12=allow space as thousands separator (in addition to langopts.thousands_sep)
 483         // bits13-15  post-decimal-digits 0=single digits, 1=(LANG=it) 2=(LANG=pl) 3=(LANG=ro)
 484
 485         // bit16= dot after number indicates ordinal
 486         // bit17= don't add pause after a number
 487         // bit18= 'and' before hundreds
 488         // bit19= 'and' after thousands if there are no hundreds
 489         // bit20= vigesimal number, if tens are not found
 490         // bit21= omit "one" before "thousand"
 491         // bit22= say "zero" before hundred
 492         // bit23= add "and" after hundreds and thousands, only if there are digits and no tens
 493
 494         // bit24= recognize roman numbers
 495         // bit25= Roman numbers only if upper case
 496         // bit26= say "roman" after the number, not before
 497         // bit27= Roman numbers are ordinal numbers
 498    // bit28= only one primary stress in tens+units (on the tens)
 499         int numbers;
 500
 501 #define NUM2_THOUSANDS_VAR1     0x40
 502 #define NUM2_THOUSANDS_VAR2     0x80
 503 #define NUM2_THOUSANDS_VAR3     0xc0
 504 #define NUM2_THOUSANDS_VAR4     0x100
 505 #define NUM2_THOUSANDS_VAR5     0x140
 506
 507 #define NUM2_ORDINAL_NO_AND     0x800
 508 #define NUM2_MULTIPLE_ORDINAL   0x1000
 509 #define NUM2_NO_TEEN_ORDINALS   0x2000
 510 #define NUM2_MYRIADS            0x4000
 511 #define NUM2_ENGLISH_NUMERALS   0x8000
 512 #define NUM2_PERCENT_BEFORE     0x10000
 513         // bits 1-4  use variant form of numbers before thousands,millions,etc.
 514         // bits 6-8  use different forms of thousand, million, etc (M MA MB)
 515         // bit9=(LANG=rw) say "thousand" and "million" before its number, not after
 516         // bit11=(LANG=es,an) don't say 'and' between tens and units for ordinal numbers
 517         // bit12=(LANG=el,es) use ordinal form of hundreds and tens as well as units
 518         // bit13=(LANG=pt) don't use 11-19 numbers to make ordinals
 519         // bit14=(LANG=ko)  use myriads (groups of 4 digits) not thousands (groups of 3)
 520         // bit15=(LANG=ne)  speak (non-replaced) English numerals in English
 521         // bit16=(LANG=si)  say "%" before the number
 522         int numbers2;
 523
 524 #define BREAK_THOUSANDS   0x49249248
 525         int break_numbers;  // which digits to break the number into thousands, millions, etc (Hindi has 100,000 not 1,000,000)
 526         int max_roman;
 527         int min_roman;
 528         int thousands_sep;
 529         int decimal_sep;
 530         int max_digits;    // max number of digits which can be spoken as an integer number (rather than individual digits)
 531         const char *ordinal_indicator;   // UTF-8 string
 532
 533         // bit 0, accent name before the letter name, bit 1 "capital" after letter name
 534         int accents;
 535
 536         int tone_language;          // 1=tone language
 537         int intonation_group;
 538         unsigned char tunes[6];
 539         int long_stop;          // extra mS pause for a lengthened stop
 540         int phoneme_change;     // TEST, change phonemes, after translation
 541         char max_initial_consonants;
 542         char spelling_stress;   // 0=default, 1=stress first letter
 543         char tone_numbers;
 544         char ideographs;      // treat as separate words
 545         char textmode;          // the meaning of FLAG_TEXTMODE is reversed (to save data when *_list file is compiled)
 546         char dotless_i;         // uses letter U+0131
 547         int testing;            // testing options: bit 1= specify stressed syllable in the form:  "outdoor/2"
 548         int listx;    // compile *_listx after *list
 549         const unsigned int *replace_chars;      // characters to be substituted
 550         char ascii_language[8];  // switch to this language for Latin characters
 551         int our_alphabet;           // offset for main alphabet (if not set in letter_bits_offset)
 552         int alt_alphabet;       // offset for another language to recognize
 553         int alt_alphabet_lang;  // language for the alt_alphabet
 554         int max_lengthmod;
 555         int lengthen_tonic;   // lengthen the tonic syllable
 556         int suffix_add_e;      // replace a suffix (which has the SUFX_E flag) with this character
 557 } LANGUAGE_OPTIONS;
 558
 559
 560 // a parameter of ChangePhonemes()
 561 typedef struct {
 562         int flags;   // NOTE(review): the meaning of these bits is not documented in the visible part of this header
 563         unsigned char stress;          // stress level of this vowel
 564         unsigned char stress_highest;  // the highest stress level of a vowel in this word
 565         unsigned char n_vowels;        // number of vowels in the word
 566         unsigned char vowel_this;      // syllable number of this vowel (counting from 1)
 567         unsigned char vowel_stressed;  // syllable number of the highest stressed vowel
 568 } CHANGEPH;
 569
 570
 571
 572 typedef struct   // per-language translator state: language options, rule/dictionary data pointers and per-word/per-clause working variables
 573 {//===========
 574
 575         LANGUAGE_OPTIONS langopts;
 576         int translator_name;
 577         int transpose_max;
 578         int transpose_min;
 579         const char *transpose_map;
 580         char dictionary_name[40];
 581
 582         char phon_out[500];
 583         char phonemes_repeat[20];
 584         int  phonemes_repeat_count;
 585         int  phoneme_tab_ix;
 586
 587         unsigned char stress_amps[8];
 588         unsigned char stress_amps_r[8];
 589         short stress_lengths[8];
 590         int dict_condition;    // conditionally apply some pronunciation rules and dict.lookups
 591         int dict_min_size;
 592         const unsigned short *charset_a0;   // unicodes for characters 0xa0 to 0xff
 593         const wchar_t *char_plus_apostrophe;  // single chars + apostrophe treated as words
 594         const wchar_t *punct_within_word;   // allow these punctuation characters within words
 595         const unsigned short *chars_ignore;
 596
 597 // holds properties of characters: vowel, consonant, etc for pronunciation rules
 598         unsigned char letter_bits[256];
 599         int letter_bits_offset;
 600         const wchar_t *letter_groups[8];
 601
 602         /* index1=option, index2 by 0=. 1=, 2=?, 3=! 4=none */
 603 #define INTONATION_TYPES 8
 604 #define PUNCT_INTONATIONS 6
 605         unsigned char punct_to_tone[INTONATION_TYPES][PUNCT_INTONATIONS];
 606
 607         char *data_dictrules;     // language_1   translation rules file
 608         char *data_dictlist;      // language_2   dictionary lookup file
 609         char *dict_hashtab[N_HASH_DICT];   // hash table to index dictionary lookup file
 610         char *letterGroups[N_LETTER_GROUPS];
 611
 612         // groups1 and groups2 are indexes into data_dictrules, set up by InitGroups()
 613         // the two-letter rules for each letter must be consecutive in the language_rules source
 614
 615         char *groups1[256];         // translation rule lists, index by single letter
 616         char *groups3[128];         // index by offset letter
 617         char *groups2[N_RULE_GROUP2];   // translation rule lists, indexed by two-letter pairs
 618         unsigned int groups2_name[N_RULE_GROUP2];  // the two letter pairs for groups2[]
 619         int n_groups2;              // number of groups2[] entries used
 620
 621         unsigned char groups2_count[256];    // number of 2 letter groups for this initial letter
 622         unsigned char groups2_start[256];    // index into groups2
 623         const short *frequent_pairs;   // list of frequent pairs of letters, for use in compressed *_list
 624
 625         int expect_verb;
 626         int expect_past;    // expect past tense
 627         int expect_verb_s;
 628         int expect_noun;
 629         int prev_last_stress;
 630         char *clause_end;
 631
 632         int word_vowel_count;     // number of vowels so far
 633         int word_stressed_count;  // number of vowels so far which could be stressed
 634
 635         int clause_upper_count;   // number of upper case letters in the clause
 636         int clause_lower_count;   // number of lower case letters in the clause
 637
 638         int prepause_timeout;
 639         int end_stressed_vowel;  // word ends with stressed vowel
 640         int prev_dict_flags[2];     // dictionary flags from previous word
 641         int clause_terminator;
 642 } Translator;
 643
 644
 645 extern int option_tone2;   // run-time options: only declared here, the definitions are in another translation unit
 646 #define OPTION_EMPHASIZE_ALLCAPS  0x100   // NOTE(review): presumably a bit tested in option_tone_flags - confirm
 647 #define OPTION_EMPHASIZE_PENULTIMATE 0x200   // NOTE(review): presumably a bit tested in option_tone_flags - confirm
 648 extern int option_tone_flags;   // NOTE(review): presumably holds the OPTION_EMPHASIZE_* bits defined just above - confirm
 649 extern int option_waveout;
 650 extern int option_quiet;
 651 extern int option_phonemes;
 652 extern int option_mbrola_phonemes;
 653 extern int option_phoneme_events;
 654 extern int option_linelength;     // treat lines shorter than this as end-of-clause
 655 extern int option_multibyte;
 656 extern int option_capitals;
 657 extern int option_punctuation;
 658 extern int option_endpause;
 659 extern int option_ssml;   // NOTE(review): presumably non-zero when the input is to be parsed as SSML - confirm
 660 extern int option_phoneme_input;   // allow [[phonemes]] in input text
 661 extern int option_phoneme_variants;
 662 extern int option_sayas;
 663 extern int option_wordgap;
 664
 665 extern int count_characters;   // NOTE(review): count_* are presumably running totals for the text processed so far - confirm
 666 extern int count_words;
 667 extern int count_sentences;
 668 extern int skip_characters;   // NOTE(review): skip_* presumably give a position to skip forward to before speaking - confirm
 669 extern int skip_words;
 670 extern int skip_sentences;
 671 extern int skipping_text;
 672 extern int end_character_position;
 673 extern int clause_start_char;
 674 extern int clause_start_word;
 675 extern char *namedata;   // pointer only; allocation and ownership are not visible in this header
 676 extern int pre_pause;
 677
 678
 679
 680 #define N_MARKER_LENGTH 50   // max. length of a mark name
 681 extern char skip_marker[N_MARKER_LENGTH];   // array dimension is N_MARKER_LENGTH
 682
 683 #define N_PUNCTLIST  60   // number of wchar_t elements in option_punctlist[]
 684 extern wchar_t option_punctlist[N_PUNCTLIST];  // which punctuation characters to announce
 685 extern unsigned char punctuation_to_tone[INTONATION_TYPES][PUNCT_INTONATIONS];   // first index ranges over INTONATION_TYPES, second over PUNCT_INTONATIONS
 686
 687 extern Translator *translator;
 688 extern Translator *translator2;   // NOTE(review): presumably the secondary translator selected through SetTranslator2() - confirm
 689 extern const unsigned short *charsets[N_CHARSETS];   // N_CHARSETS pointers to read-only tables of unsigned short
 690 extern char dictionary_name[40];   // fixed buffer of 40 chars
 691 extern char ctrl_embedded;    // to allow an alternative CTRL for embedded commands
 692 extern unsigned char *p_textinput;   // NOTE(review): presumably the current position in 8-bit input text - confirm
 693 extern wchar_t *p_wchar_input;   // NOTE(review): presumably the current position in wide-character input text - confirm
 694 extern int dictionary_skipwords;
 695
 696 extern int (* uri_callback)(int, const char *, const char *);   // function pointer; parameters are unnamed here, so their meaning must be taken from the definition
 697 extern int (* phoneme_callback)(const char *);   // function pointer taking one read-only string and returning int
 698 extern void SetLengthMods(Translator *tr, int value);
 699
 700 void LoadConfig(void);
 701 int TransposeAlphabet(Translator *tr, char *text);   // text is non-const, so the callee is allowed to modify it in place
 702 int utf8_in(int *c, const char *buf);   // NOTE(review): presumably decodes one UTF-8 character from buf into *c and returns the bytes used - confirm
 703 int utf8_in2(int *c, const char *buf, int backwards);   // same parameters as utf8_in() plus a 'backwards' flag
 704 int utf8_out(unsigned int c, char *buf);   // no buffer size is passed; NOTE(review): presumably encodes c into buf, caller must ensure room - confirm
 705 int utf8_nbytes(const char *buf);
 706 int lookupwchar(const unsigned short *list,int c);
 707 int lookupwchar2(const unsigned short *list,int c);
 708 int Eof(void);
 709 char *strchr_w(const char *s, int c);   // returns a non-const char* although s is const
 710 int IsBracket(int c);
 711 void InitNamedata(void);
 712 void InitText(int flags);
 713 void InitText2(void);
 714 int IsDigit(unsigned int c);   // takes unsigned int, unlike IsBracket()/iswalpha2() which take int
 715 int IsDigit09(unsigned int c);
 716 int IsAlpha(unsigned int c);
 717 int IsVowel(Translator *tr, int c);   // the only character test here that also takes a Translator
 718 int iswalpha2(int c);   // NOTE(review): the '2' suffix presumably marks local replacements for the <wctype.h>/<ctype.h> functions - confirm
 719 int isspace2(unsigned int c);
 720 int iswlower2(int c);
 721 int iswupper2(int c);
 722 int towlower2(unsigned int c);
 723 int towupper2(unsigned int c);
 724 void GetTranslatedPhonemeString(char *phon_out, int n_phon_out, int phoneme_mode);   // NOTE(review): n_phon_out is presumably the capacity of phon_out - confirm
 725 const char *WordToString2(unsigned int word);   // result is read-only; its storage duration is not visible here
 726 ALPHABET *AlphabetFromChar(int c);
 727 ALPHABET *AlphabetFromName(const char *name);
 728
 729 Translator *SelectTranslator(const char *name);   // NOTE(review): presumably returns a newly created Translator to be released with DeleteTranslator() - confirm
 730 int SetTranslator2(const char *name);
 731 void DeleteTranslator(Translator *tr);
 732 int Lookup(Translator *tr, const char *word, char *ph_out);   // ph_out has no size argument; NOTE(review): presumably an output buffer the caller must size adequately - confirm
 733 int LookupFlags(Translator *tr, const char *word);
 734
 735 int TranslateNumber(Translator *tr, char *word1, char *ph_out, unsigned int *flags, WORD_TAB *wtab, int control);   // NOTE(review): ph_out and *flags are presumably outputs - confirm
 736 int TranslateRoman(Translator *tr, char *word, char *ph_out, WORD_TAB *wtab);   // same parameters as TranslateNumber() minus flags and control
 737
 738 void ChangeWordStress(Translator *tr, char *word, int new_stress);   // word is non-const, so it may be rewritten in place
 739 void SetSpellingStress(Translator *tr, char *phonemes, int control, int n_chars);
 740 int TranslateLetter(Translator *tr, char *letter, char *phonemes, int control);
 741 void LookupLetter(Translator *tr, unsigned int letter, int next_byte, char *ph_buf, int control);   // ph_buf has no size argument; NOTE(review): presumably an output buffer - confirm
 742 void LookupAccentedLetter(Translator *tr, unsigned int letter, char *ph_buf);
 743
 744 int LoadDictionary(Translator *tr, const char *name, int no_error);   // NOTE(review): no_error presumably suppresses the error report when loading fails - confirm
 745 int LookupDictList(Translator *tr, char **wordptr, char *ph_out, unsigned int *flags, int end_flags, WORD_TAB *wtab);   // wordptr is char**, so the callee is able to change the caller's word pointer
 746
 747 void MakePhonemeList(Translator *tr, int post_pause, int new_sentence);
 748 int ChangePhonemes_ru(Translator *tr, PHONEME_LIST2 *phlist, int n_ph, int index, PHONEME_TAB *ph, CHANGEPH *ch);   // NOTE(review): the _ru suffix presumably marks a Russian-specific routine - confirm
 749 void ApplySpecialAttribute2(Translator *tr, char *phonemes, int dict_flags);
 750 void AppendPhonemes(Translator *tr, char *string, int size, const char *ph);   // NOTE(review): size is presumably the capacity of string, ph the text appended to it - confirm
 751
 752 void CalcLengths(Translator *tr);   // returns nothing; any result is delivered through state not visible in this prototype
 753 void CalcPitches(Translator *tr, int clause_tone);   // returns nothing; takes the clause tone in addition to the Translator
 754
 755 int RemoveEnding(Translator *tr, char *word, int end_type, char *word_copy);   // both word and word_copy are non-const; NOTE(review): word_copy presumably receives the unmodified word - confirm
 756 int Unpronouncable(Translator *tr, char *word, int posn);
 757 void SetWordStress(Translator *tr, char *output, unsigned int *dictionary_flags, int tonic, int prev_stress);
 758 int TranslateRules(Translator *tr, char *p, char *phonemes, int size, char *end_phonemes, int end_flags, unsigned int *dict_flags);   // NOTE(review): size is presumably the capacity of phonemes - confirm
 759 int TranslateWord(Translator *tr, char *word1, int next_pause, WORD_TAB *wtab, char *word_out);
 760 void *TranslateClause(Translator *tr, FILE *f_text, const void *vp_input, int *tone, char **voice_change);   // two input parameters (FILE* and untyped pointer); which one is read is not visible here
 761 int ReadClause(Translator *tr, FILE *f_in, char *buf, short *charix, int *charix_top, int n_buf, int *tone_type, char *voice_change);   // NOTE(review): n_buf is presumably the capacity of buf - confirm
 762
 763 void SetVoiceStack(espeak_VOICE *v, const char *variant_name);
 764 void InterpretPhoneme(Translator *tr, int control, PHONEME_LIST *plist, PHONEME_DATA *phdata, WORD_PH_DATA *worddata);   // NOTE(review): phdata is presumably the output record - confirm
 765 void InterpretPhoneme2(int phcode, PHONEME_DATA *phdata);   // takes a phoneme code instead of a Translator/PHONEME_LIST pair
 766 char *WritePhMnemonic(char *phon_out, PHONEME_TAB *ph, PHONEME_LIST *plist, int use_ipa, int *flags);   // no size is passed for phon_out; NOTE(review): the returned pointer presumably points just past the text written - confirm
 767
 768 extern FILE *f_trans;           // for logging
 769 extern FILE *f_logespeak;   // NOTE(review): presumably a separate log stream controlled by logging_type - confirm
 770 extern int logging_type;  // from config file