"Fossies" - the Fresh Open Source Software Archive

Member "speech_tools/sigpr/pda/pda.cc" (4 Sep 2017, 13461 Bytes) of package /linux/misc/speech_tools-2.5.0-release.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "pda.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4-release_vs_2.5.0-release.

    1 /*************************************************************************/
    2 /*                                                                       */
    3 /*                Centre for Speech Technology Research                  */
    4 /*                     University of Edinburgh, UK                       */
    5 /*                      Copyright (c) 1995,1996                          */
    6 /*                        All Rights Reserved.                           */
    7 /*                                                                       */
    8 /*  Permission is hereby granted, free of charge, to use and distribute  */
    9 /*  this software and its documentation without restriction, including   */
   10 /*  without limitation the rights to use, copy, modify, merge, publish,  */
   11 /*  distribute, sublicense, and/or sell copies of this work, and to      */
   12 /*  permit persons to whom this work is furnished to do so, subject to   */
   13 /*  the following conditions:                                            */
   14 /*   1. The code must retain the above copyright notice, this list of    */
   15 /*      conditions and the following disclaimer.                         */
   16 /*   2. Any modifications must be clearly marked as such.                */
   17 /*   3. Original authors' names are not deleted.                         */
   18 /*   4. The authors' names are not used to endorse or promote products   */
   19 /*      derived from this software without specific prior written        */
   20 /*      permission.                                                      */
   21 /*                                                                       */
   22 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
   23 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
   24 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
   25 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
   26 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
   27 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
   28 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
   29 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
   30 /*  THIS SOFTWARE.                                                       */
   31 /*                                                                       */
   32 /*************************************************************************/
   33 /*                   Author :  Paul Taylor                               */
   34 /*                   Date   :  April 1994                                */
   35 /*************************************************************************/
   36 
   37 #include "EST_speech_class.h"
   38 #include "sigpr/EST_sigpr_utt.h"
   39 #include "sigpr/EST_filter.h"
   40 #include "srpd.h"
   41 #include "EST_error.h"
   42 #include "EST_string_aux.h"
   43 
   44 int read_next_wave_segment (EST_Wave &sig, struct Srpd_Op *paras, 
   45                 SEGMENT_ *p_seg);
   46 
   47 static void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize);
   48 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd);
   49 static void parse_srpd_list(EST_Features &a_list, struct Srpd_Op *srpd);
   50 
   51 void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
   52 {
   53     if (method == "")
   54     {
   55     if (op.present("pda_method"))
   56         method = op.S("pda_method");
   57     }
   58     if (method == "")   
   59     srpd(sig, fz, op);
   60     else if  (method == "srpd")
   61     srpd(sig, fz, op);
   62     else
   63     EST_error("Unknown pda %s\n", (const char *)method);
   64 }
   65 
   66 void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op, 
   67            EST_String method)
   68 { // intonation contour detection algorithm
   69     EST_Track raw_fz;
   70     if (method == "")
   71     {
   72     if (op.present("pda_method"))
   73         method = op.S("pda_method");
   74     }
   75     if (method == "")   
   76     srpd(sig, raw_fz, op);
   77     else if  (method == "srpd")
   78     srpd(sig, raw_fz, op);
   79     else
   80     EST_error("Unknown pda %s\n", (const char *)method);
   81 
   82     smooth_phrase(raw_fz, speech, op, fz);
   83 }
   84 
   85 void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
   86 {
   87     Srpd_Op srpd_op;
   88 
   89     default_srpd_op(&srpd_op); // default values
   90     parse_srpd_list(op, &srpd_op); // override with options
   91 
   92     if (op.I("do_low_pass",0))
   93     FIRlowpass_filter(sig, op.I("lpf_cutoff"),op.I("lpf_order"));
   94 
   95     srpd(sig, fz, srpd_op, op.I("srpd_resize", 0));
   96 }
   97 
   98 /*void do_srpd_fz(EST_Wave &sig, EST_Track &fz)
   99 {
  100     Srpd_Op srpd_op;
  101     default_srpd_op(&srpd_op);
  102     srpd(sig, fz, srpd_op, 1);
  103 }
  104 */
  105 
  106 void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize)
  107 {
  108     int i, rns, tracklen, j = 0;
  109     SEGMENT_ segment;
  110     CROSS_CORR_ cc;
  111     STATUS_ pda_status, held_status;
  112     srpd_op.sample_freq = sig.sample_rate();
  113 #if 0
  114     float min, max;
  115     min = srpd_op.min_pitch; // must store as set up routines corrupt
  116     max = srpd_op.max_pitch;
  117 #endif
  118 
  119     initialise_structures (&srpd_op, &segment, &cc);
  120     initialise_status (&srpd_op, &pda_status);
  121     initialise_status (&srpd_op, &held_status);
  122 
  123     tracklen = (sig.num_samples() - segment.length) / segment.shift + 1;
  124 
  125     if (resize)
  126     {
  127     fz.set_equal_space(true);
  128     fz.resize(tracklen, 1);
  129     fz.set_channel_name("F0", 0);
  130     fz.fill_time(srpd_op.shift/1000);
  131     }
  132 
  133     if (!fz.equal_space())
  134     EST_error("Pitch tracking algorithm must have equal spaced track\n");
  135     
  136     while ((rns = read_next_wave_segment (sig, &srpd_op, &segment)) != 0) 
  137     {
  138     if (rns == 2) 
  139     {
  140         for (i = 0; i < cc.size; cc.coeff[i++] = 0.0);
  141         initialise_status (&srpd_op, &pda_status);
  142     }
  143     else
  144         super_resolution_pda (&srpd_op, segment, &cc, &pda_status);
  145     if (pda_status.s_h == HOLD) 
  146     {
  147         held_status.pitch_freq = pda_status.pitch_freq;
  148         held_status.v_uv = VOICED;
  149         held_status.s_h = HELD;
  150         held_status.cc_max = pda_status.cc_max;
  151         held_status.threshold = pda_status.threshold;
  152         continue;
  153     }
  154     if (held_status.s_h == HELD) 
  155     {
  156         if (pda_status.pitch_freq == BREAK_NUMBER) 
  157         {
  158         held_status.pitch_freq = BREAK_NUMBER;
  159         held_status.v_uv = UNVOICED;
  160         }
  161         held_status.s_h = SENT;
  162         if (held_status.v_uv != VOICED) 
  163         fz.set_break(j);
  164         fz.a(j++) = held_status.pitch_freq;
  165         //    printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), held_status.pitch_freq );
  166     }
  167     if (pda_status.v_uv != VOICED) 
  168         fz.set_break(j);
  169     fz.a(j++) = pda_status.pitch_freq;
  170     //printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), pda_status.pitch_freq );
  171     }
  172     if (held_status.s_h == HELD) 
  173     {
  174     held_status.pitch_freq = BREAK_NUMBER;
  175     held_status.v_uv = UNVOICED;
  176     fz.set_break(j);
  177     fz.a(j++) = held_status.pitch_freq;
  178     }
  179     end_structure_use (&segment, &cc);
  180 }
  181 
  182 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd)
  183 { 
  184     srpd->L = DEFAULT_DECIMATION;
  185     srpd->min_pitch = DEFAULT_MIN_PITCH;
  186     srpd->max_pitch = DEFAULT_MAX_PITCH;
  187     srpd->shift = DEFAULT_SHIFT;
  188     srpd->length = DEFAULT_LENGTH;
  189     srpd->Tsilent = DEFAULT_TSILENT;
  190     srpd->Tmin = DEFAULT_TMIN;
  191     srpd->Tmax_ratio = DEFAULT_TMAX_RATIO;
  192     srpd->Thigh = DEFAULT_THIGH;
  193     srpd->Tdh = DEFAULT_TDH;
  194     srpd->make_ascii = 0;
  195     srpd->peak_tracking = 0;
  196     srpd->sample_freq = DEFAULT_SF;
  197       /* p_par->Nmax and p_par->Nmin cannot be initialised */
  198     return(srpd);
  199 }
  200 
  201 static void parse_srpd_list(EST_Features &al, struct Srpd_Op *srpd)
  202 { 
  203     if (al.present("decimation"))
  204     srpd->L = al.I("decimation");
  205     if (al.present("min_pitch"))
  206     srpd->min_pitch = al.F("min_pitch");
  207     if (al.present("max_pitch"))
  208     srpd->max_pitch = al.F("max_pitch");    
  209     if (al.present("pda_frame_shift"))
  210     srpd->shift = al.F("pda_frame_shift") * 1000.0;
  211     if (al.present("pda_frame_length"))
  212     srpd->length = al.F("pda_frame_length") * 1000.0;
  213     if (al.present("noise_floor"))
  214     srpd->Tsilent = al.I("noise_floor");
  215     if (al.present("v2uv_coeff_thresh"))
  216     srpd->Thigh = al.F("v2uv_coef_thresh");
  217     if (al.present("min_v2uv_coef_thresh"))
  218     srpd->Tmin = al.F("min_v2uv_coef_thresh");
  219     if (al.present("v2uv_coef_thresh_ratio"))
  220     srpd->Tmax_ratio = al.F("v2uv_coef_thresh_ratio");
  221     if (al.present("anti_doubling_thresh"))
  222     srpd->Tdh = al.F("anti_doubling_thresh");
  223     if (al.present("peak_tracking"))
  224     srpd->peak_tracking = al.I("peak_tracking");
  225     if (al.present("sample_frequency"))
  226     srpd->sample_freq = al.I("sample_frequency");
  227 }
  228 
  229 void default_pda_options(EST_Features &al)
  230 {
  231     al.set("min_pitch", "40.0");
  232     al.set("max_pitch", "400.0");
  233     al.set("pda_frame_shift", "0.005");
  234     al.set("pda_frame_length", DEFAULT_LENGTH / 1000.0);
  235     al.set("lpf_cutoff", "600");
  236     al.set("lpf_order", "49");
  237     al.set("f0_file_type", "esps");
  238     al.set("decimation", DEFAULT_DECIMATION);
  239     al.set("noise_floor", DEFAULT_TSILENT);
  240     al.set("min_v2uv_coef_thresh", DEFAULT_TMIN);
  241     al.set("v2uv_coef_thresh_ratio", DEFAULT_TMAX_RATIO);
  242     al.set("v2uv_coef_thresh", DEFAULT_THIGH);
  243     al.set("anti_doubling_thresh", DEFAULT_TDH);
  244     al.set("peak_tracking", 0);
  245 }
  246 
  247 EST_String options_pda_general(void)
  248 {
  249     // The standard waveform input options 
  250     return
  251     EST_String("")+
  252     "-L  Perform low pass filtering on input. This option should always \n"
  253     "    be used in normal processing as it usually increases \n"
  254     "    performance considerably\n\n"
  255     "-P  perform peak tracking\n\n" 
  256     "-fmin <float> miniumum F0 value. Sets the minimum allowed F0 in \n" 
  257     "    output track. Default is "+ftoString(DEFAULT_MIN_PITCH)+".\n "
  258     "    Changing this to suit the speaker usually increases  \n"
  259     "    performance. Typical recommended values are 60-90Hz for\n"
  260     "    males and 120-150Hz  for females\n\n"
  261     "-fmax <float> maxiumum F0 value. Sets the maximum allowed F0 in \n" 
  262     "    output track. Default is "+ftoString(DEFAULT_MAX_PITCH)+". \n"
  263     "    Changing this to suit the speaker usually increases \n"
  264     "    performance. Typical recommended values are 200Hz for \n"
  265     "    males and 300-400Hz for females\n\n"
  266     "-shift <float> frame spacing in seconds for fixed frame analysis. \n"
  267     "    This doesn't have to be the same as the output file spacing - \n"
  268     "    the -S option can be used to resample the track before saving \n"
  269     "    default: "+ftoString(DEFAULT_SHIFT/1000.0) +"\n\n"
  270     "-length <float> analysis frame length in seconds.\n"
  271     "    default: "+ftoString(DEFAULT_LENGTH/1000.0) +"\n\n"
  272     "-lpfilter <int>   Low pass filter, with cutoff frequency in Hz \n"
  273     "    Filtering is performed by a FIR filter which is built at run \n"
  274     "    time. The order of the filter can be given by -forder. The \n"
  275     "    default value is 199\n\n"
  276     "-forder <int>  Order of FIR filter used for lpfilter and \n"
  277     "    hpfilter. This must be ODD. Sensible values range \n"
  278     "    from 19 (quick but with a shallow rolloff) to 199 \n"
  279     "    (slow but with a steep rolloff). The default is 199.\n\n";
  280 }
  281 
  282 EST_String options_pda_srpd(void)
  283 {
  284     // The standard waveform input options 
  285     return
  286     EST_String("")+
  287     "-d <float> decimation factor\n"
  288     "    set down-sampling for quicker computation so that only one in \n"
  289     "    <parameter>decimation factor</parameter> samples are used in the first instance. \n"
  290     "    Must be in the range of one to ten inclusive. Default is four. \n"
  291     "    For data sampled at 10kHz, it is advised that a decimation \n"
  292     "    factor of two isselected.\n\n"
  293 
  294     "-n <float> Inoise floor.\n"
  295     "    Set the maximum absolute signal amplitude that represents  \n"
  296     "    silence to <parameter>Inoise floor</parameter>. If the absolute amplitude of \n"
  297     "    the first segment in a given frame is below this level at all \n"
  298     "    times, then the frame is classified as representing silence. \n"
  299     "    Must be a positive number. Default is 120 ADC units.\n\n"
  300 
  301     "-H <float> unvoiced to voiced coeff threshold\n"
  302     "    set the correlation coefficient threshold which must be \n"
  303     "    exceeded in a transition from an unvoiced classified frame \n"
  304     "    of speech to a voiced frame as the unvoiced to voiced coeff \n"
  305     "    threshold. Must be in the range zero to one inclusive. \n"
  306     "    Default is 0.88.\n\n"
  307 
  308     "-m <float> min voiced to unvoiced coeff threshold \n"
  309     "    set the minimum allowed correlation coefficient threshold \n"
  310     "    which must not be exceeded in a transition from a voiced \n"
  311     "    classified frame of speech to an unvoiced frame, as \n"
  312     "    <parameter>min voiced to unvoiced coeff threshold</parameter>. Must be in the \n"
  313     "    range zero to <parameter>unvoiced to voiced coeff threshold</parameter> \n"
  314     "    inclusive. Default is 0.75.\n\n"
  315 
  316     "-R <float> voiced to unvoiced coeff threshold-ratio  \n"
  317     "    set the scaling factor used in determining the correlation\n"
  318     "    coefficient threshold which must not be exceeded in a voiced \n"
  319     "    frame to unvoiced frame transition, as <parameter>voiced to unvoiced</parameter> \n"
  320     "    coeff threshold -ratio. The voiced to unvoiced coefficient \n"
  321     "    threshold is determined by multiplying this scaling factor \n"
  322     "    with the maximum cross-correlation coefficient of the \n"
  323     "    previously voiced frame. If this product is less than \n"
  324     "    <parameter>min voiced to unvoiced coeff threshold</parameter> then this is used \n"
  325     "    instead. Must be in the range zero to one inclusive. \n"
  326     "     Default is 0.85.\n\n"
  327 
  328     "-t <float> anti pitch doubling/halving threshold\n"
  329     "    set the threshold used in eliminating (as far as possible) \n"
  330     "    pitch doubling and pitch halving errors as <parameter>anti pitch \n"
  331     "    double/halving threshold</parameter>. Must be in the range zero to \n"
  332     "    one inclusive. Default is 0.77.\n\n";
  333 }
  334 
  335