"Fossies" - the Fresh Open Source Software Archive

Member "ffmpeg-3.4.2/libavcodec/mips/aaccoder_mips.c" (31 Dec 2017, 105585 Bytes) of package /linux/misc/ffmpeg-3.4.2.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "aaccoder_mips.c", see the Fossies "Dox" file reference documentation.

    1 /*
    2  * Copyright (c) 2012
    3  *      MIPS Technologies, Inc., California.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
   14  *    contributors may be used to endorse or promote products derived from
   15  *    this software without specific prior written permission.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  * Author:  Stanislav Ocovaj (socovaj@mips.com)
   30  *          Szabolcs Pal     (sabolc@mips.com)
   31  *
   32  * AAC coefficients encoder optimized for MIPS floating-point architecture
   33  *
   34  * This file is part of FFmpeg.
   35  *
   36  * FFmpeg is free software; you can redistribute it and/or
   37  * modify it under the terms of the GNU Lesser General Public
   38  * License as published by the Free Software Foundation; either
   39  * version 2.1 of the License, or (at your option) any later version.
   40  *
   41  * FFmpeg is distributed in the hope that it will be useful,
   42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   44  * Lesser General Public License for more details.
   45  *
   46  * You should have received a copy of the GNU Lesser General Public
   47  * License along with FFmpeg; if not, write to the Free Software
   48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   49  */
   50 
   51 /**
   52  * @file
   53  * Reference: libavcodec/aaccoder.c
   54  */
   55 
   56 #include "libavutil/libm.h"
   57 
   58 #include <float.h>
   59 #include "libavutil/mathematics.h"
   60 #include "libavcodec/avcodec.h"
   61 #include "libavcodec/put_bits.h"
   62 #include "libavcodec/aac.h"
   63 #include "libavcodec/aacenc.h"
   64 #include "libavcodec/aactab.h"
   65 #include "libavcodec/aacenctab.h"
   66 #include "libavcodec/aacenc_utils.h"
   67 
   68 #if HAVE_INLINE_ASM
   69 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
   70 typedef struct BandCodingPath {
          /*
           * One entry of a band coding path. The code that uses this type is
           * not visible in this part of the file, so the field notes below are
           * inferred from the names only -- TODO confirm against the users.
           */
   71     int prev_idx;   /* presumably the index of the preceding path entry */
   72     float cost;     /* presumably the cost accumulated up to this entry */
   73     int run;        /* presumably the run length ending at this entry */
   74 } BandCodingPath;
   75 
   76 static const uint8_t uquad_sign_bits[81] = {
          /*
           * 81 = 3^4 entries. Entry k holds the number of non-zero digits of k
           * written as four base-3 digits. This is the same value as "count"
           * computed by quantize_and_encode_band_cost_UQUAD_mips() for its
           * curidx, i.e. the number of sign bits appended to the codeword.
           * Each row below covers one value of the two leading digits.
           */
   77     0, 1, 1, 1, 2, 2, 1, 2, 2,
   78     1, 2, 2, 2, 3, 3, 2, 3, 3,
   79     1, 2, 2, 2, 3, 3, 2, 3, 3,
   80     1, 2, 2, 2, 3, 3, 2, 3, 3,
   81     2, 3, 3, 3, 4, 4, 3, 4, 4,
   82     2, 3, 3, 3, 4, 4, 3, 4, 4,
   83     1, 2, 2, 2, 3, 3, 2, 3, 3,
   84     2, 3, 3, 3, 4, 4, 3, 4, 4,
   85     2, 3, 3, 3, 4, 4, 3, 4, 4
   86 };
   87 
   88 static const uint8_t upair7_sign_bits[64] = {
          /*
           * 8 x 8 entries, one row per first value a in 0..7 and one column
           * per second value b in 0..7. Entry [8*a + b] is (a != 0) + (b != 0),
           * which matches "count1"/"count2" in
           * quantize_and_encode_band_cost_UPAIR7_mips() for index 8*qc1 + qc2.
           */
   89     0, 1, 1, 1, 1, 1, 1, 1,
   90     1, 2, 2, 2, 2, 2, 2, 2,
   91     1, 2, 2, 2, 2, 2, 2, 2,
   92     1, 2, 2, 2, 2, 2, 2, 2,
   93     1, 2, 2, 2, 2, 2, 2, 2,
   94     1, 2, 2, 2, 2, 2, 2, 2,
   95     1, 2, 2, 2, 2, 2, 2, 2,
   96     1, 2, 2, 2, 2, 2, 2, 2,
   97 };
   98 
   99 static const uint8_t upair12_sign_bits[169] = {
          /*
           * 13 x 13 entries, one row per first value a in 0..12 and one column
           * per second value b in 0..12. Entry [13*a + b] is
           * (a != 0) + (b != 0), which matches "count1"/"count2" in
           * quantize_and_encode_band_cost_UPAIR12_mips() for index
           * 13*qc1 + qc2.
           */
  100     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  101     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  102     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  103     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  104     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  105     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  106     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  107     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  108     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  109     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  110     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  111     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  112     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  113 };
  114 
  115 static const uint8_t esc_sign_bits[289] = {
          /*
           * 17 x 17 entries, one row per first value a in 0..16 and one column
           * per second value b in 0..16. Entry [17*a + b] is
           * (a != 0) + (b != 0). The code that reads this table is not visible
           * in this part of the file; by analogy with the tables above it is
           * presumably the sign-bit count for a pair index -- TODO confirm.
           */
  116     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  117     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  118     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  119     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  120     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  121     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  122     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  123     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  124     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  125     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  126     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  127     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  128     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  129     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  130     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  131     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  132     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
  133 };
  134 
  135 /**
  136  * Functions developed from the template function and optimized for quantizing and encoding a band
  137  */
  138 static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
  139                                                      PutBitContext *pb, const float *in, float *out,
  140                                                      const float *scaled, int size, int scale_idx,
  141                                                      int cb, const float lambda, const float uplim,
  142                                                      int *bits, float *energy, const float ROUNDING)
  143 {
          /*
           * Quantize one band and write it with a signed codebook that codes
           * four coefficients per codeword, each quantized to -1, 0 or +1.
           *
           * s          encoder context; s->scoefs is filled by abs_pow34_v()
           * pb         bit writer; receives one codeword per four coefficients
           * in         input coefficients, read four at a time (size is assumed
           *            to be a multiple of 4 -- TODO confirm with callers)
           * out        if non-NULL, receives the dequantized coefficients
           * scaled     value passed in is ignored: reassigned to s->scoefs
           * size       number of coefficients in the band
           * scale_idx  selects the Q34 / IQ factors from the lookup tables
           * cb         codebook number; the tables are indexed with cb-1
           * energy     if non-NULL, receives the sum of the squared
           *            dequantized coefficients
           *
           * lambda, uplim, bits and ROUNDING are not used by this variant; the
           * rounding offset is always ROUND_STANDARD.
           */
  144     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  145     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  146     int i;
  147     int qc1, qc2, qc3, qc4;
  148     float qenergy = 0.0f;
  149 
  150     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  151     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  152     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
  153 
  154     abs_pow34_v(s->scoefs, in, size);
  155     scaled = s->scoefs;
  156     for (i = 0; i < size; i += 4) {
  157         int curidx;
              /* Raw 32-bit view of in[i..i+3]; only the asm below reads it. */
  158         int *in_int = (int *)&in[i];
  159         int t0, t1, t2, t3, t4, t5, t6, t7;
  160         const float *vec;
  161 
              /* Float-to-int conversion truncates, so the offset does the rounding. */
  162         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
  163         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  164         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  165         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  166 
  167         __asm__ volatile (
  168             ".set push                      \n\t"
  169             ".set noreorder                 \n\t"
  170 
                  /* qcN = (0 < qcN): collapse each magnitude to 0 or 1 */
  171             "slt    %[qc1], $zero,  %[qc1]  \n\t"
  172             "slt    %[qc2], $zero,  %[qc2]  \n\t"
  173             "slt    %[qc3], $zero,  %[qc3]  \n\t"
  174             "slt    %[qc4], $zero,  %[qc4]  \n\t"
                  /* t0..t3 = raw bits of in[i..i+3], reduced to the top (sign) bit */
  175             "lw     %[t0],  0(%[in_int])    \n\t"
  176             "lw     %[t1],  4(%[in_int])    \n\t"
  177             "lw     %[t2],  8(%[in_int])    \n\t"
  178             "lw     %[t3],  12(%[in_int])   \n\t"
  179             "srl    %[t0],  %[t0],  31      \n\t"
  180             "srl    %[t1],  %[t1],  31      \n\t"
  181             "srl    %[t2],  %[t2],  31      \n\t"
  182             "srl    %[t3],  %[t3],  31      \n\t"
                  /* t4..t7 = -qcN */
  183             "subu   %[t4],  $zero,  %[qc1]  \n\t"
  184             "subu   %[t5],  $zero,  %[qc2]  \n\t"
  185             "subu   %[t6],  $zero,  %[qc3]  \n\t"
  186             "subu   %[t7],  $zero,  %[qc4]  \n\t"
                  /* qcN = -qcN where the input sign bit is set */
  187             "movn   %[qc1], %[t4],  %[t0]   \n\t"
  188             "movn   %[qc2], %[t5],  %[t1]   \n\t"
  189             "movn   %[qc3], %[t6],  %[t2]   \n\t"
  190             "movn   %[qc4], %[t7],  %[t3]   \n\t"
  191 
  192             ".set pop                       \n\t"
  193 
  194             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  195               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  196               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  197               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  198             : [in_int]"r"(in_int)
  199             : "memory"
  200         );
  201 
              /*
               * Base-3 index built from (qc1, qc2, qc3, qc4). The +40
               * (= 27 + 9 + 3 + 1) shifts every digit from -1..1 to 0..2, so
               * curidx ends up in 0..80.
               */
  202         curidx = qc1;
  203         curidx *= 3;
  204         curidx += qc2;
  205         curidx *= 3;
  206         curidx += qc3;
  207         curidx *= 3;
  208         curidx += qc4;
  209         curidx += 40;
  210 
  211         put_bits(pb, p_bits[curidx], p_codes[curidx]);
  212 
  213         if (out || energy) {
  214             float e1,e2,e3,e4;
                  /* Four consecutive floats per codebook entry, scaled by IQ. */
  215             vec = &p_vec[curidx*4];
  216             e1 = vec[0] * IQ;
  217             e2 = vec[1] * IQ;
  218             e3 = vec[2] * IQ;
  219             e4 = vec[3] * IQ;
  220             if (out) {
  221                 out[i+0] = e1;
  222                 out[i+1] = e2;
  223                 out[i+2] = e3;
  224                 out[i+3] = e4;
  225             }
  226             if (energy)
  227                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  228         }
  229     }
  230     if (energy)
  231         *energy = qenergy;
  232 }
  233 
  234 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
  235                                                      PutBitContext *pb, const float *in, float *out,
  236                                                      const float *scaled, int size, int scale_idx,
  237                                                      int cb, const float lambda, const float uplim,
  238                                                      int *bits, float *energy, const float ROUNDING)
  239 {
          /*
           * Quantize one band and write it with an unsigned codebook that
           * codes four coefficients per codeword. Magnitudes are clamped to
           * 2, and one sign bit per non-zero quantized coefficient is appended
           * after the codeword.
           *
           * s          encoder context; s->scoefs is filled by abs_pow34_v()
           * pb         bit writer; receives one put_bits() call per four
           *            coefficients
           * in         input coefficients, read four at a time (size is assumed
           *            to be a multiple of 4 -- TODO confirm with callers)
           * out        if non-NULL, receives the dequantized coefficients
           * scaled     value passed in is ignored: reassigned to s->scoefs
           * size       number of coefficients in the band
           * scale_idx  selects the Q34 / IQ factors from the lookup tables
           * cb         codebook number; the tables are indexed with cb-1
           * energy     if non-NULL, receives the sum of the squared
           *            dequantized coefficients
           *
           * lambda, uplim, bits and ROUNDING are not used by this variant; the
           * rounding offset is always ROUND_STANDARD.
           */
  240     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  241     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  242     int i;
  243     int qc1, qc2, qc3, qc4;
  244     float qenergy = 0.0f;
  245 
  246     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  247     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  248     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
  249 
  250     abs_pow34_v(s->scoefs, in, size);
  251     scaled = s->scoefs;
  252     for (i = 0; i < size; i += 4) {
  253         int curidx, sign, count;
              /* Raw 32-bit view of in[i..i+3]; only the asm below reads it. */
  254         int *in_int = (int *)&in[i];
  255         uint8_t v_bits;
  256         unsigned int v_codes;
  257         int t0, t1, t2, t3, t4;
  258         const float *vec;
  259 
              /* Float-to-int conversion truncates, so the offset does the rounding. */
  260         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
  261         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  262         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  263         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  264 
              /*
               * On exit from the asm:
               *   qcN   = min(qcN, 2)
               *   sign  = sign bits of the inputs whose qcN is non-zero, packed
               *           in coefficient order with the earliest one highest
               *   count = number of qcN that are greater than zero
               */
  265         __asm__ volatile (
  266             ".set push                              \n\t"
  267             ".set noreorder                         \n\t"
  268 
                  /* t4 = clamp limit, sign = 0 */
  269             "ori    %[t4],      $zero,      2       \n\t"
  270             "ori    %[sign],    $zero,      0       \n\t"
                  /* tN = (2 < qcN); qcN = 2 where that holds */
  271             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
  272             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
  273             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
  274             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
  275             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
  276             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
  277             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
  278             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
                  /* t0..t3 = raw bits of in[i..i+3] */
  279             "lw     %[t0],      0(%[in_int])        \n\t"
  280             "lw     %[t1],      4(%[in_int])        \n\t"
  281             "lw     %[t2],      8(%[in_int])        \n\t"
  282             "lw     %[t3],      12(%[in_int])       \n\t"
                  /* tN = (raw bits < 0 as a signed int), i.e. the sign bit;
                   * sign is extended by one bit only when the matching qcN != 0 */
  283             "slt    %[t0],      %[t0],      $zero   \n\t"
  284             "movn   %[sign],    %[t0],      %[qc1]  \n\t"
  285             "slt    %[t1],      %[t1],      $zero   \n\t"
  286             "slt    %[t2],      %[t2],      $zero   \n\t"
  287             "slt    %[t3],      %[t3],      $zero   \n\t"
  288             "sll    %[t0],      %[sign],    1       \n\t"
  289             "or     %[t0],      %[t0],      %[t1]   \n\t"
  290             "movn   %[sign],    %[t0],      %[qc2]  \n\t"
                  /* from here the non-zero counting is interleaved with the
                   * remaining sign packing; t4, t1 and t2 are reused as flags */
  291             "slt    %[t4],      $zero,      %[qc1]  \n\t"
  292             "slt    %[t1],      $zero,      %[qc2]  \n\t"
  293             "slt    %[count],   $zero,      %[qc3]  \n\t"
  294             "sll    %[t0],      %[sign],    1       \n\t"
  295             "or     %[t0],      %[t0],      %[t2]   \n\t"
  296             "movn   %[sign],    %[t0],      %[qc3]  \n\t"
  297             "slt    %[t2],      $zero,      %[qc4]  \n\t"
  298             "addu   %[count],   %[count],   %[t4]   \n\t"
  299             "addu   %[count],   %[count],   %[t1]   \n\t"
  300             "sll    %[t0],      %[sign],    1       \n\t"
  301             "or     %[t0],      %[t0],      %[t3]   \n\t"
  302             "movn   %[sign],    %[t0],      %[qc4]  \n\t"
  303             "addu   %[count],   %[count],   %[t2]   \n\t"
  304 
  305             ".set pop                               \n\t"
  306 
  307             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  308               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  309               [sign]"=&r"(sign), [count]"=&r"(count),
  310               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  311               [t4]"=&r"(t4)
  312             : [in_int]"r"(in_int)
  313             : "memory"
  314         );
  315 
              /* Base-3 index built from the clamped magnitudes. */
  316         curidx = qc1;
  317         curidx *= 3;
  318         curidx += qc2;
  319         curidx *= 3;
  320         curidx += qc3;
  321         curidx *= 3;
  322         curidx += qc4;
  323 
              /* Codeword followed by the count sign bits, written in one call. */
  324         v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
  325         v_bits  = p_bits[curidx] + count;
  326         put_bits(pb, v_bits, v_codes);
  327 
  328         if (out || energy) {
  329             float e1,e2,e3,e4;
                  /* Four floats per entry; each result takes the sign of its input. */
  330             vec = &p_vec[curidx*4];
  331             e1 = copysignf(vec[0] * IQ, in[i+0]);
  332             e2 = copysignf(vec[1] * IQ, in[i+1]);
  333             e3 = copysignf(vec[2] * IQ, in[i+2]);
  334             e4 = copysignf(vec[3] * IQ, in[i+3]);
  335             if (out) {
  336                 out[i+0] = e1;
  337                 out[i+1] = e2;
  338                 out[i+2] = e3;
  339                 out[i+3] = e4;
  340             }
  341             if (energy)
  342                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  343         }
  344     }
  345     if (energy)
  346         *energy = qenergy;
  347 }
  348 
  349 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
  350                                                      PutBitContext *pb, const float *in, float *out,
  351                                                      const float *scaled, int size, int scale_idx,
  352                                                      int cb, const float lambda, const float uplim,
  353                                                      int *bits, float *energy, const float ROUNDING)
  354 {
          /*
           * Quantize one band and write it with a signed codebook that codes
           * two coefficients per codeword, each quantized to -4..4. Two pairs
           * are handled per loop iteration and their codewords are written
           * with a single put_bits() call.
           *
           * s          encoder context; s->scoefs is filled by abs_pow34_v()
           * pb         bit writer
           * in         input coefficients, read four at a time (size is assumed
           *            to be a multiple of 4 -- TODO confirm with callers)
           * out        if non-NULL, receives the dequantized coefficients
           * scaled     value passed in is ignored: reassigned to s->scoefs
           * size       number of coefficients in the band
           * scale_idx  selects the Q34 / IQ factors from the lookup tables
           * cb         codebook number; the tables are indexed with cb-1
           * energy     if non-NULL, receives the sum of the squared
           *            dequantized coefficients
           *
           * lambda, uplim, bits and ROUNDING are not used by this variant; the
           * rounding offset is always ROUND_STANDARD.
           */
  355     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  356     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  357     int i;
  358     int qc1, qc2, qc3, qc4;
  359     float qenergy = 0.0f;
  360 
  361     uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
  362     uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
  363     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
  364 
  365     abs_pow34_v(s->scoefs, in, size);
  366     scaled = s->scoefs;
  367     for (i = 0; i < size; i += 4) {
  368         int curidx, curidx2;
              /* Raw 32-bit view of in[i..i+3]; only the asm below reads it. */
  369         int *in_int = (int *)&in[i];
  370         uint8_t v_bits;
  371         unsigned int v_codes;
  372         int t0, t1, t2, t3, t4, t5, t6, t7;
  373         const float *vec1, *vec2;
  374 
              /* Float-to-int conversion truncates, so the offset does the rounding. */
  375         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
  376         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  377         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  378         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  379 
  380         __asm__ volatile (
  381             ".set push                      \n\t"
  382             ".set noreorder                 \n\t"
  383 
                  /* t4 = clamp limit; tN = (4 < qcN); qcN = 4 where that holds */
  384             "ori    %[t4],  $zero,  4       \n\t"
  385             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
  386             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
  387             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
  388             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
  389             "movn   %[qc1], %[t4],  %[t0]   \n\t"
  390             "movn   %[qc2], %[t4],  %[t1]   \n\t"
  391             "movn   %[qc3], %[t4],  %[t2]   \n\t"
  392             "movn   %[qc4], %[t4],  %[t3]   \n\t"
                  /* t0..t3 = raw bits of in[i..i+3], reduced to the top (sign) bit */
  393             "lw     %[t0],  0(%[in_int])    \n\t"
  394             "lw     %[t1],  4(%[in_int])    \n\t"
  395             "lw     %[t2],  8(%[in_int])    \n\t"
  396             "lw     %[t3],  12(%[in_int])   \n\t"
  397             "srl    %[t0],  %[t0],  31      \n\t"
  398             "srl    %[t1],  %[t1],  31      \n\t"
  399             "srl    %[t2],  %[t2],  31      \n\t"
  400             "srl    %[t3],  %[t3],  31      \n\t"
                  /* t4..t7 = -qcN (t4 no longer holds the clamp limit) */
  401             "subu   %[t4],  $zero,  %[qc1]  \n\t"
  402             "subu   %[t5],  $zero,  %[qc2]  \n\t"
  403             "subu   %[t6],  $zero,  %[qc3]  \n\t"
  404             "subu   %[t7],  $zero,  %[qc4]  \n\t"
                  /* qcN = -qcN where the input sign bit is set */
  405             "movn   %[qc1], %[t4],  %[t0]   \n\t"
  406             "movn   %[qc2], %[t5],  %[t1]   \n\t"
  407             "movn   %[qc3], %[t6],  %[t2]   \n\t"
  408             "movn   %[qc4], %[t7],  %[t3]   \n\t"
  409 
  410             ".set pop                       \n\t"
  411 
  412             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  413               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  414               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  415               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
  416             : [in_int]"r"(in_int)
  417             : "memory"
  418         );
  419 
              /*
               * Base-9 index per pair. The +40 (= 9*4 + 4) shifts both digits
               * from -4..4 to 0..8, so each index ends up in 0..80.
               */
  420         curidx = 9 * qc1;
  421         curidx += qc2 + 40;
  422 
  423         curidx2 = 9 * qc3;
  424         curidx2 += qc4 + 40;
  425 
              /* First codeword in the high bits, second one directly after it. */
  426         v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
  427         v_bits  = p_bits[curidx] + p_bits[curidx2];
  428         put_bits(pb, v_bits, v_codes);
  429 
  430         if (out || energy) {
  431             float e1,e2,e3,e4;
                  /* Two consecutive floats per codebook entry, scaled by IQ. */
  432             vec1 = &p_vec[curidx*2 ];
  433             vec2 = &p_vec[curidx2*2];
  434             e1 = vec1[0] * IQ;
  435             e2 = vec1[1] * IQ;
  436             e3 = vec2[0] * IQ;
  437             e4 = vec2[1] * IQ;
  438             if (out) {
  439                 out[i+0] = e1;
  440                 out[i+1] = e2;
  441                 out[i+2] = e3;
  442                 out[i+3] = e4;
  443             }
  444             if (energy)
  445                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  446         }
  447     }
  448     if (energy)
  449         *energy = qenergy;
  450 }
  451 
  452 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
  453                                                       PutBitContext *pb, const float *in, float *out,
  454                                                       const float *scaled, int size, int scale_idx,
  455                                                       int cb, const float lambda, const float uplim,
  456                                                       int *bits, float *energy, const float ROUNDING)
  457 {
          /*
           * Quantize one band and write it with an unsigned codebook that
           * codes two coefficients per codeword. Magnitudes are clamped to 7,
           * and one sign bit per non-zero quantized coefficient is appended
           * after each codeword. Two pairs are handled per loop iteration,
           * each written with its own put_bits() call.
           *
           * s          encoder context; s->scoefs is filled by abs_pow34_v()
           * pb         bit writer
           * in         input coefficients, read four at a time (size is assumed
           *            to be a multiple of 4 -- TODO confirm with callers)
           * out        if non-NULL, receives the dequantized coefficients
           * scaled     value passed in is ignored: reassigned to s->scoefs
           * size       number of coefficients in the band
           * scale_idx  selects the Q34 / IQ factors from the lookup tables
           * cb         codebook number; the tables are indexed with cb-1
           * energy     if non-NULL, receives the sum of the squared
           *            dequantized coefficients
           *
           * lambda, uplim, bits and ROUNDING are not used by this variant; the
           * rounding offset is always ROUND_STANDARD.
           */
  458     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  459     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  460     int i;
  461     int qc1, qc2, qc3, qc4;
  462     float qenergy = 0.0f;
  463 
  464     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
  465     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  466     float    *p_vec   = (float    *)ff_aac_codebook_vectors[cb-1];
  467 
  468     abs_pow34_v(s->scoefs, in, size);
  469     scaled = s->scoefs;
  470     for (i = 0; i < size; i += 4) {
  471         int curidx1, curidx2, sign1, count1, sign2, count2;
              /* Raw 32-bit view of in[i..i+3]; only the asm below reads it. */
  472         int *in_int = (int *)&in[i];
  473         uint8_t v_bits;
  474         unsigned int v_codes;
  475         int t0, t1, t2, t3, t4;
  476         const float *vec1, *vec2;
  477 
              /* Float-to-int conversion truncates, so the offset does the rounding. */
  478         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
  479         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  480         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  481         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  482 
              /*
               * On exit from the asm:
               *   qcN    = min(qcN, 7)
               *   sign1  = sign bits of in[i], in[i+1] whose qc is non-zero,
               *            packed with the earlier coefficient highest
               *   sign2  = the same for in[i+2], in[i+3]
               *   count1 = (0 < qc1) + (0 < qc2), count2 = (0 < qc3) + (0 < qc4)
               */
  483         __asm__ volatile (
  484             ".set push                              \n\t"
  485             ".set noreorder                         \n\t"
  486 
                  /* t4 = clamp limit, sign1 = sign2 = 0 */
  487             "ori    %[t4],      $zero,      7       \n\t"
  488             "ori    %[sign1],   $zero,      0       \n\t"
  489             "ori    %[sign2],   $zero,      0       \n\t"
                  /* tN = (7 < qcN); qcN = 7 where that holds */
  490             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
  491             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
  492             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
  493             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
  494             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
  495             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
  496             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
  497             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
                  /* t0..t3 = raw bits of in[i..i+3] */
  498             "lw     %[t0],      0(%[in_int])        \n\t"
  499             "lw     %[t1],      4(%[in_int])        \n\t"
  500             "lw     %[t2],      8(%[in_int])        \n\t"
  501             "lw     %[t3],      12(%[in_int])       \n\t"
                  /* tN = (raw bits < 0 as a signed int), i.e. the sign bit;
                   * a sign word is extended only when the matching qcN != 0 */
  502             "slt    %[t0],      %[t0],      $zero   \n\t"
  503             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
  504             "slt    %[t2],      %[t2],      $zero   \n\t"
  505             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
  506             "slt    %[t1],      %[t1],      $zero   \n\t"
  507             "sll    %[t0],      %[sign1],   1       \n\t"
  508             "or     %[t0],      %[t0],      %[t1]   \n\t"
  509             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
  510             "slt    %[t3],      %[t3],      $zero   \n\t"
  511             "sll    %[t0],      %[sign2],   1       \n\t"
  512             "or     %[t0],      %[t0],      %[t3]   \n\t"
  513             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
                  /* count the quantized values that are greater than zero, per pair */
  514             "slt    %[count1],  $zero,      %[qc1]  \n\t"
  515             "slt    %[t1],      $zero,      %[qc2]  \n\t"
  516             "slt    %[count2],  $zero,      %[qc3]  \n\t"
  517             "slt    %[t2],      $zero,      %[qc4]  \n\t"
  518             "addu   %[count1],  %[count1],  %[t1]   \n\t"
  519             "addu   %[count2],  %[count2],  %[t2]   \n\t"
  520 
  521             ".set pop                               \n\t"
  522 
  523             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  524               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  525               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  526               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  527               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  528               [t4]"=&r"(t4)
  529             : [in_int]"r"(in_int)
  530             : /* every scratch register is an operand, so no hard registers are clobbered */
  531               "memory"
  532         );
  533 
              /* Base-8 index of the first pair. */
  534         curidx1  = 8 * qc1;
  535         curidx1 += qc2;
  536 
              /* Codeword followed by its count1 sign bits. */
  537         v_codes = (p_codes[curidx1] << count1) | sign1;
  538         v_bits  = p_bits[curidx1] + count1;
  539         put_bits(pb, v_bits, v_codes);
  540 
              /* Base-8 index of the second pair. */
  541         curidx2  = 8 * qc3;
  542         curidx2 += qc4;
  543 
              /* Codeword followed by its count2 sign bits. */
  544         v_codes = (p_codes[curidx2] << count2) | sign2;
  545         v_bits  = p_bits[curidx2] + count2;
  546         put_bits(pb, v_bits, v_codes);
  547 
  548         if (out || energy) {
  549             float e1,e2,e3,e4;
                  /* Two floats per entry; each result takes the sign of its input. */
  550             vec1 = &p_vec[curidx1*2];
  551             vec2 = &p_vec[curidx2*2];
  552             e1 = copysignf(vec1[0] * IQ, in[i+0]);
  553             e2 = copysignf(vec1[1] * IQ, in[i+1]);
  554             e3 = copysignf(vec2[0] * IQ, in[i+2]);
  555             e4 = copysignf(vec2[1] * IQ, in[i+3]);
  556             if (out) {
  557                 out[i+0] = e1;
  558                 out[i+1] = e2;
  559                 out[i+2] = e3;
  560                 out[i+3] = e4;
  561             }
  562             if (energy)
  563                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  564         }
  565     }
  566     if (energy)
  567         *energy = qenergy;
  568 }
  569 
  570 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
  571                                                        PutBitContext *pb, const float *in, float *out,
  572                                                        const float *scaled, int size, int scale_idx,
  573                                                        int cb, const float lambda, const float uplim,
  574                                                        int *bits, float *energy, const float ROUNDING)
  575 {
          /*
           * Quantize one band and write it with an unsigned codebook that
           * codes two coefficients per codeword. Magnitudes are clamped to 12,
           * and one sign bit per non-zero quantized coefficient is appended
           * after each codeword. Two pairs are handled per loop iteration,
           * each written with its own put_bits() call.
           *
           * s          encoder context; s->scoefs is filled by abs_pow34_v()
           * pb         bit writer
           * in         input coefficients, read four at a time (size is assumed
           *            to be a multiple of 4 -- TODO confirm with callers)
           * out        if non-NULL, receives the dequantized coefficients
           * scaled     value passed in is ignored: reassigned to s->scoefs
           * size       number of coefficients in the band
           * scale_idx  selects the Q34 / IQ factors from the lookup tables
           * cb         codebook number; the tables are indexed with cb-1
           * energy     if non-NULL, receives the sum of the squared
           *            dequantized coefficients
           *
           * lambda, uplim, bits and ROUNDING are not used by this variant; the
           * rounding offset is always ROUND_STANDARD.
           */
  576     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  577     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  578     int i;
  579     int qc1, qc2, qc3, qc4;
  580     float qenergy = 0.0f;
  581 
  582     uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
  583     uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
  584     float    *p_vec   = (float   *)ff_aac_codebook_vectors[cb-1];
  585 
  586     abs_pow34_v(s->scoefs, in, size);
  587     scaled = s->scoefs;
  588     for (i = 0; i < size; i += 4) {
  589         int curidx1, curidx2, sign1, count1, sign2, count2;
              /* Raw 32-bit view of in[i..i+3]; only the asm below reads it. */
  590         int *in_int = (int *)&in[i];
  591         uint8_t v_bits;
  592         unsigned int v_codes;
  593         int t0, t1, t2, t3, t4;
  594         const float *vec1, *vec2;
  595 
              /* Float-to-int conversion truncates, so the offset does the rounding. */
  596         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
  597         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
  598         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
  599         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
  600 
              /*
               * On exit from the asm:
               *   qcN    = min(qcN, 12)
               *   sign1  = sign bits of in[i], in[i+1] whose qc is non-zero,
               *            packed with the earlier coefficient highest
               *   sign2  = the same for in[i+2], in[i+3]
               *   count1 = (0 < qc1) + (0 < qc2), count2 = (0 < qc3) + (0 < qc4)
               */
  601         __asm__ volatile (
  602             ".set push                              \n\t"
  603             ".set noreorder                         \n\t"
  604 
                  /* t4 = clamp limit, sign1 = sign2 = 0 */
  605             "ori    %[t4],      $zero,      12      \n\t"
  606             "ori    %[sign1],   $zero,      0       \n\t"
  607             "ori    %[sign2],   $zero,      0       \n\t"
                  /* tN = (12 < qcN); qcN = 12 where that holds */
  608             "slt    %[t0],      %[t4],      %[qc1]  \n\t"
  609             "slt    %[t1],      %[t4],      %[qc2]  \n\t"
  610             "slt    %[t2],      %[t4],      %[qc3]  \n\t"
  611             "slt    %[t3],      %[t4],      %[qc4]  \n\t"
  612             "movn   %[qc1],     %[t4],      %[t0]   \n\t"
  613             "movn   %[qc2],     %[t4],      %[t1]   \n\t"
  614             "movn   %[qc3],     %[t4],      %[t2]   \n\t"
  615             "movn   %[qc4],     %[t4],      %[t3]   \n\t"
                  /* t0..t3 = raw bits of in[i..i+3] */
  616             "lw     %[t0],      0(%[in_int])        \n\t"
  617             "lw     %[t1],      4(%[in_int])        \n\t"
  618             "lw     %[t2],      8(%[in_int])        \n\t"
  619             "lw     %[t3],      12(%[in_int])       \n\t"
                  /* tN = (raw bits < 0 as a signed int), i.e. the sign bit;
                   * a sign word is extended only when the matching qcN != 0 */
  620             "slt    %[t0],      %[t0],      $zero   \n\t"
  621             "movn   %[sign1],   %[t0],      %[qc1]  \n\t"
  622             "slt    %[t2],      %[t2],      $zero   \n\t"
  623             "movn   %[sign2],   %[t2],      %[qc3]  \n\t"
  624             "slt    %[t1],      %[t1],      $zero   \n\t"
  625             "sll    %[t0],      %[sign1],   1       \n\t"
  626             "or     %[t0],      %[t0],      %[t1]   \n\t"
  627             "movn   %[sign1],   %[t0],      %[qc2]  \n\t"
  628             "slt    %[t3],      %[t3],      $zero   \n\t"
  629             "sll    %[t0],      %[sign2],   1       \n\t"
  630             "or     %[t0],      %[t0],      %[t3]   \n\t"
  631             "movn   %[sign2],   %[t0],      %[qc4]  \n\t"
                  /* count the quantized values that are greater than zero, per pair */
  632             "slt    %[count1],  $zero,      %[qc1]  \n\t"
  633             "slt    %[t1],      $zero,      %[qc2]  \n\t"
  634             "slt    %[count2],  $zero,      %[qc3]  \n\t"
  635             "slt    %[t2],      $zero,      %[qc4]  \n\t"
  636             "addu   %[count1],  %[count1],  %[t1]   \n\t"
  637             "addu   %[count2],  %[count2],  %[t2]   \n\t"
  638 
  639             ".set pop                               \n\t"
  640 
  641             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  642               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  643               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  644               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  645               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  646               [t4]"=&r"(t4)
  647             : [in_int]"r"(in_int)
  648             : "memory"
  649         );
  650 
              /* Base-13 index of the first pair. */
  651         curidx1  = 13 * qc1;
  652         curidx1 += qc2;
  653 
              /* Codeword followed by its count1 sign bits. */
  654         v_codes = (p_codes[curidx1] << count1) | sign1;
  655         v_bits  = p_bits[curidx1] + count1;
  656         put_bits(pb, v_bits, v_codes);
  657 
              /* Base-13 index of the second pair. */
  658         curidx2  = 13 * qc3;
  659         curidx2 += qc4;
  660 
              /* Codeword followed by its count2 sign bits. */
  661         v_codes = (p_codes[curidx2] << count2) | sign2;
  662         v_bits  = p_bits[curidx2] + count2;
  663         put_bits(pb, v_bits, v_codes);
  664 
  665         if (out || energy) {
  666             float e1,e2,e3,e4;
                  /* Two floats per entry; each result takes the sign of its input. */
  667             vec1 = &p_vec[curidx1*2];
  668             vec2 = &p_vec[curidx2*2];
  669             e1 = copysignf(vec1[0] * IQ, in[i+0]);
  670             e2 = copysignf(vec1[1] * IQ, in[i+1]);
  671             e3 = copysignf(vec2[0] * IQ, in[i+2]);
  672             e4 = copysignf(vec2[1] * IQ, in[i+3]);
  673             if (out) {
  674                 out[i+0] = e1;
  675                 out[i+1] = e2;
  676                 out[i+2] = e3;
  677                 out[i+3] = e4;
  678             }
  679             if (energy)
  680                 qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  681         }
  682     }
  683     if (energy)
  684         *energy = qenergy;
  685 }
  686 
  687 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
  688                                                    PutBitContext *pb, const float *in, float *out,
  689                                                    const float *scaled, int size, int scale_idx,
  690                                                    int cb, const float lambda, const float uplim,
  691                                                    int *bits, float *energy, const float ROUNDING)
  692 {
          /*
           * Quantize the coefficients of one band, write the resulting codewords
           * to pb with put_bits() and optionally reconstruct the coefficients.
           *
           * Four coefficients (two pairs) are handled per loop iteration; each
           * pair is written as one codeword from the cb-1 tables followed by its
           * sign bits.  The cb < 11 branch clamps the quantized values to 16 and
           * never writes escape sequences; the other branch additionally writes
           * an escape sequence for every value whose codebook vector is 64.0f.
           *
           * - scaled: the value passed by the caller is ignored; it is replaced by
           *   s->scoefs, which abs_pow34_v() fills from in.
           * - out:    if non-NULL, receives the reconstructed coefficients.
           * - energy: if non-NULL, receives the sum of squares of the
           *   reconstructed coefficients.
           * - lambda, uplim and bits are not referenced in this function.
           *
           * NOTE(review): elements i..i+3 are accessed unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
  693     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
  694     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
  695     int i;
  696     int qc1, qc2, qc3, qc4;
  697     float qenergy = 0.0f;
  698 
  699     uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
  700     uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
  701     float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
  702 
  703     abs_pow34_v(s->scoefs, in, size);
  704     scaled = s->scoefs;
  705 
  706     if (cb < 11) {
  707         for (i = 0; i < size; i += 4) {
  708             int curidx, curidx2, sign1, count1, sign2, count2;
  709             int *in_int = (int *)&in[i];
  710             uint8_t v_bits;
  711             unsigned int v_codes;
  712             int t0, t1, t2, t3, t4;
  713             const float *vec1, *vec2;
  714 
  715             qc1 = scaled[i  ] * Q34 + ROUNDING;
  716             qc2 = scaled[i+1] * Q34 + ROUNDING;
  717             qc3 = scaled[i+2] * Q34 + ROUNDING;
  718             qc4 = scaled[i+3] * Q34 + ROUNDING;
  719 
  720             __asm__ volatile (
  721                 ".set push                                  \n\t"
  722                 ".set noreorder                             \n\t"
  723 
                      /* t4 = 16 (clamp limit); both sign words start out empty */
  724                 "ori        %[t4],      $zero,      16      \n\t"
  725                 "ori        %[sign1],   $zero,      0       \n\t"
  726                 "ori        %[sign2],   $zero,      0       \n\t"
                      /* qcN = 16 wherever qcN > 16 */
  727                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
  728                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
  729                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
  730                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
  731                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
  732                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
  733                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
  734                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
                      /* raw 32-bit words of in[i..i+3]; (word < 0) is the float sign bit */
  735                 "lw         %[t0],      0(%[in_int])        \n\t"
  736                 "lw         %[t1],      4(%[in_int])        \n\t"
  737                 "lw         %[t2],      8(%[in_int])        \n\t"
  738                 "lw         %[t3],      12(%[in_int])       \n\t"
                      /*
                       * signN gathers one sign bit per non-zero value of pair N:
                       * the first value's bit is taken when it is non-zero, then
                       * the word is shifted left and the second value's bit is
                       * appended when that value is non-zero.
                       */
  739                 "slt        %[t0],      %[t0],      $zero   \n\t"
  740                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
  741                 "slt        %[t2],      %[t2],      $zero   \n\t"
  742                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
  743                 "slt        %[t1],      %[t1],      $zero   \n\t"
  744                 "sll        %[t0],      %[sign1],   1       \n\t"
  745                 "or         %[t0],      %[t0],      %[t1]   \n\t"
  746                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
  747                 "slt        %[t3],      %[t3],      $zero   \n\t"
  748                 "sll        %[t0],      %[sign2],   1       \n\t"
  749                 "or         %[t0],      %[t0],      %[t3]   \n\t"
  750                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
                      /* countN = number of values > 0 in pair N (= number of sign bits) */
  751                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
  752                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
  753                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
  754                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
  755                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
  756                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
  757 
  758                 ".set pop                                   \n\t"
  759 
  760                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  761                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  762                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  763                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  764                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  765                   [t4]"=&r"(t4)
  766                 : [in_int]"r"(in_int)
  767                 : "memory"
  768             );
  769 
                  /* table index of a pair: 17 * first + second (values are 0..16) */
  770             curidx = 17 * qc1;
  771             curidx += qc2;
  772             curidx2 = 17 * qc3;
  773             curidx2 += qc4;
  774 
                  /* codeword followed by the sign bits of the non-zero values */
  775             v_codes = (p_codes[curidx] << count1) | sign1;
  776             v_bits  = p_bits[curidx] + count1;
  777             put_bits(pb, v_bits, v_codes);
  778 
  779             v_codes = (p_codes[curidx2] << count2) | sign2;
  780             v_bits  = p_bits[curidx2] + count2;
  781             put_bits(pb, v_bits, v_codes);
  782 
  783             if (out || energy) {
  784                 float e1,e2,e3,e4;
                      /* reconstruct from the codebook vectors, scaled by IQ, with the input's sign */
  785                 vec1 = &p_vectors[curidx*2 ];
  786                 vec2 = &p_vectors[curidx2*2];
  787                 e1 = copysignf(vec1[0] * IQ, in[i+0]);
  788                 e2 = copysignf(vec1[1] * IQ, in[i+1]);
  789                 e3 = copysignf(vec2[0] * IQ, in[i+2]);
  790                 e4 = copysignf(vec2[1] * IQ, in[i+3]);
  791                 if (out) {
  792                     out[i+0] = e1;
  793                     out[i+1] = e2;
  794                     out[i+2] = e3;
  795                     out[i+3] = e4;
  796                 }
  797                 if (energy)
  798                     qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  799             }
  800         }
  801     } else {
  802         for (i = 0; i < size; i += 4) {
  803             int curidx, curidx2, sign1, count1, sign2, count2;
  804             int *in_int = (int *)&in[i];
  805             uint8_t v_bits;
  806             unsigned int v_codes;
  807             int c1, c2, c3, c4;
  808             int t0, t1, t2, t3, t4;
  809 
  810             qc1 = scaled[i  ] * Q34 + ROUNDING;
  811             qc2 = scaled[i+1] * Q34 + ROUNDING;
  812             qc3 = scaled[i+2] * Q34 + ROUNDING;
  813             qc4 = scaled[i+3] * Q34 + ROUNDING;
  814 
  815             __asm__ volatile (
  816                 ".set push                                  \n\t"
  817                 ".set noreorder                             \n\t"
  818 
  819                 "ori        %[t4],      $zero,      16      \n\t"
  820                 "ori        %[sign1],   $zero,      0       \n\t"
  821                 "ori        %[sign2],   $zero,      0       \n\t"
                      /*
                       * cN keeps the unclamped value for escape coding: shll_s.w
                       * (MIPS DSP saturating shift left) by 18 followed by a
                       * logical shift right by 18, which for non-negative qcN
                       * limits the value to 13 bits (at most 8191).
                       */
  822                 "shll_s.w   %[c1],      %[qc1],     18      \n\t"
  823                 "shll_s.w   %[c2],      %[qc2],     18      \n\t"
  824                 "shll_s.w   %[c3],      %[qc3],     18      \n\t"
  825                 "shll_s.w   %[c4],      %[qc4],     18      \n\t"
  826                 "srl        %[c1],      %[c1],      18      \n\t"
  827                 "srl        %[c2],      %[c2],      18      \n\t"
  828                 "srl        %[c3],      %[c3],      18      \n\t"
  829                 "srl        %[c4],      %[c4],      18      \n\t"
                      /* qcN = 16 wherever qcN > 16 (used only for the table index) */
  830                 "slt        %[t0],      %[t4],      %[qc1]  \n\t"
  831                 "slt        %[t1],      %[t4],      %[qc2]  \n\t"
  832                 "slt        %[t2],      %[t4],      %[qc3]  \n\t"
  833                 "slt        %[t3],      %[t4],      %[qc4]  \n\t"
  834                 "movn       %[qc1],     %[t4],      %[t0]   \n\t"
  835                 "movn       %[qc2],     %[t4],      %[t1]   \n\t"
  836                 "movn       %[qc3],     %[t4],      %[t2]   \n\t"
  837                 "movn       %[qc4],     %[t4],      %[t3]   \n\t"
                      /* sign bits and counts are built exactly as in the cb < 11 branch */
  838                 "lw         %[t0],      0(%[in_int])        \n\t"
  839                 "lw         %[t1],      4(%[in_int])        \n\t"
  840                 "lw         %[t2],      8(%[in_int])        \n\t"
  841                 "lw         %[t3],      12(%[in_int])       \n\t"
  842                 "slt        %[t0],      %[t0],      $zero   \n\t"
  843                 "movn       %[sign1],   %[t0],      %[qc1]  \n\t"
  844                 "slt        %[t2],      %[t2],      $zero   \n\t"
  845                 "movn       %[sign2],   %[t2],      %[qc3]  \n\t"
  846                 "slt        %[t1],      %[t1],      $zero   \n\t"
  847                 "sll        %[t0],      %[sign1],   1       \n\t"
  848                 "or         %[t0],      %[t0],      %[t1]   \n\t"
  849                 "movn       %[sign1],   %[t0],      %[qc2]  \n\t"
  850                 "slt        %[t3],      %[t3],      $zero   \n\t"
  851                 "sll        %[t0],      %[sign2],   1       \n\t"
  852                 "or         %[t0],      %[t0],      %[t3]   \n\t"
  853                 "movn       %[sign2],   %[t0],      %[qc4]  \n\t"
  854                 "slt        %[count1],  $zero,      %[qc1]  \n\t"
  855                 "slt        %[t1],      $zero,      %[qc2]  \n\t"
  856                 "slt        %[count2],  $zero,      %[qc3]  \n\t"
  857                 "slt        %[t2],      $zero,      %[qc4]  \n\t"
  858                 "addu       %[count1],  %[count1],  %[t1]   \n\t"
  859                 "addu       %[count2],  %[count2],  %[t2]   \n\t"
  860 
  861                 ".set pop                                   \n\t"
  862 
  863                 : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
  864                   [qc3]"+r"(qc3), [qc4]"+r"(qc4),
  865                   [sign1]"=&r"(sign1), [count1]"=&r"(count1),
  866                   [sign2]"=&r"(sign2), [count2]"=&r"(count2),
  867                   [c1]"=&r"(c1), [c2]"=&r"(c2),
  868                   [c3]"=&r"(c3), [c4]"=&r"(c4),
  869                   [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
  870                   [t4]"=&r"(t4)
  871                 : [in_int]"r"(in_int)
  872                 : "memory"
  873             );
  874 
  875             curidx = 17 * qc1;
  876             curidx += qc2;
  877 
  878             curidx2 = 17 * qc3;
  879             curidx2 += qc4;
  880 
  881             v_codes = (p_codes[curidx] << count1) | sign1;
  882             v_bits  = p_bits[curidx] + count1;
  883             put_bits(pb, v_bits, v_codes);
  884 
                  /*
                   * A codebook vector of 64.0f triggers an escape sequence for that
                   * value: with len = av_log2(c), a (len - 3)-bit prefix
                   * ((1 << (len - 3)) - 2) followed by the low len bits of c,
                   * len * 2 - 3 bits in total.
                   */
  885             if (p_vectors[curidx*2  ] == 64.0f) {
  886                 int len = av_log2(c1);
  887                 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
  888                 put_bits(pb, len * 2 - 3, v_codes);
  889             }
  890             if (p_vectors[curidx*2+1] == 64.0f) {
  891                 int len = av_log2(c2);
  892                 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
  893                 put_bits(pb, len*2-3, v_codes);
  894             }
  895 
  896             v_codes = (p_codes[curidx2] << count2) | sign2;
  897             v_bits  = p_bits[curidx2] + count2;
  898             put_bits(pb, v_bits, v_codes);
  899 
  900             if (p_vectors[curidx2*2  ] == 64.0f) {
  901                 int len = av_log2(c3);
  902                 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
  903                 put_bits(pb, len* 2 - 3, v_codes);
  904             }
  905             if (p_vectors[curidx2*2+1] == 64.0f) {
  906                 int len = av_log2(c4);
  907                 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
  908                 put_bits(pb, len * 2 - 3, v_codes);
  909             }
  910 
  911             if (out || energy) {
  912                 float e1, e2, e3, e4;
                      /* reconstruct as c * cbrtf(c) * IQ (c to the power 4/3) with the input's sign */
  913                 e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
  914                 e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
  915                 e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
  916                 e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
  917                 if (out) {
  918                     out[i+0] = e1;
  919                     out[i+1] = e2;
  920                     out[i+2] = e3;
  921                     out[i+3] = e4;
  922                 }
  923                 if (energy)
  924                     qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
  925             }
  926         }
  927     }
  928     if (energy)
  929         *energy = qenergy;
  930 }
  931 
  932 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
  933                                                          PutBitContext *pb, const float *in, float *out,
  934                                                          const float *scaled, int size, int scale_idx,
  935                                                          int cb, const float lambda, const float uplim,
  936                                                          int *bits, float *energy, const float ROUNDING) {
          /*
           * Placeholder with the common quantize-and-encode signature: it does no
           * work and unconditionally fails av_assert0(), so reaching it is treated
           * as a programming error.  None of the parameters are used.
           */
  937     av_assert0(0);
  938 }
  939 
  940 static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
  941                                                          PutBitContext *pb, const float *in, float *out,
  942                                                          const float *scaled, int size, int scale_idx,
  943                                                          int cb, const float lambda, const float uplim,
  944                                                          int *bits, float *energy, const float ROUNDING) {
          /*
           * Variant that writes nothing to pb: it reports zero bits and zero
           * energy and clears the output band.  bits, out and energy are each
           * optional (skipped when NULL); all other parameters except size are
           * unused.
           */
  945     int i;
  946     if (bits)
  947         *bits = 0;
  948     if (out) {
              /*
               * Clears four floats per iteration, so the writes cover size rounded
               * up to a multiple of 4.
               * NOTE(review): presumably size is always a multiple of 4 -- confirm.
               */
  949         for (i = 0; i < size; i += 4) {
  950            out[i  ] = 0.0f;
  951            out[i+1] = 0.0f;
  952            out[i+2] = 0.0f;
  953            out[i+3] = 0.0f;
  954         }
  955     }
  956     if (energy)
  957         *energy = 0.0f;
  958 }
  959 
  960 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
  961                                                          PutBitContext *pb, const float *in, float *out,
  962                                                          const float *scaled, int size, int scale_idx,
  963                                                          int cb, const float lambda, const float uplim,
  964                                                          int *bits, float *energy, const float ROUNDING) = {
          /*
           * Table of quantize-and-encode implementations, indexed by cb.
           * Entry order (index: function): 0 ZERO, 1-2 SQUAD, 3-4 UQUAD,
           * 5-6 SPAIR, 7-8 UPAIR7, 9-10 UPAIR12, 11 ESC, 12 NONE, 13-15 ZERO.
           * The table has 16 entries; no bounds check is done on the index.
           */
  965     quantize_and_encode_band_cost_ZERO_mips,
  966     quantize_and_encode_band_cost_SQUAD_mips,
  967     quantize_and_encode_band_cost_SQUAD_mips,
  968     quantize_and_encode_band_cost_UQUAD_mips,
  969     quantize_and_encode_band_cost_UQUAD_mips,
  970     quantize_and_encode_band_cost_SPAIR_mips,
  971     quantize_and_encode_band_cost_SPAIR_mips,
  972     quantize_and_encode_band_cost_UPAIR7_mips,
  973     quantize_and_encode_band_cost_UPAIR7_mips,
  974     quantize_and_encode_band_cost_UPAIR12_mips,
  975     quantize_and_encode_band_cost_UPAIR12_mips,
  976     quantize_and_encode_band_cost_ESC_mips,
  977     quantize_and_encode_band_cost_NONE_mips, /* cb 12 doesn't exist */
  978     quantize_and_encode_band_cost_ZERO_mips,
  979     quantize_and_encode_band_cost_ZERO_mips,
  980     quantize_and_encode_band_cost_ZERO_mips,
  981 };
  982 
      /*
       * Calls the table entry selected by cb, forwarding all arguments unchanged.
       * Note that cb appears twice in the expansion (as the table index and as an
       * argument), so an argument expression with side effects would be evaluated
       * twice.
       */
  983 #define quantize_and_encode_band_cost(                                       \
  984                                 s, pb, in, out, scaled, size, scale_idx, cb, \
  985                                 lambda, uplim, bits, energy, ROUNDING)       \
  986     quantize_and_encode_band_cost_arr[cb](                                   \
  987                                 s, pb, in, out, scaled, size, scale_idx, cb, \
  988                                 lambda, uplim, bits, energy, ROUNDING)
  989 
  990 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
  991                                           const float *in, float *out, int size, int scale_idx,
  992                                           int cb, const float lambda, int rtz)
  993 {
          /*
           * Encode-only entry point: forwards to the per-codebook implementation
           * selected by cb, passing NULL for scaled, bits and energy and INFINITY
           * for uplim.  A non-zero rtz selects ROUND_TO_ZERO as the rounding
           * offset, otherwise ROUND_STANDARD is used.
           */
  994     quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
  995                                   INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
  996 }
  997 
  998 /**
  999  * Functions developed from template function and optimized for getting the number of bits
 1000  */
 1001 static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
 1002                                         PutBitContext *pb, const float *in,
 1003                                         const float *scaled, int size, int scale_idx,
 1004                                         int cb, const float lambda, const float uplim,
 1005                                         int *bits)
 1006 {
          /* Always reports a bit count of 0; no parameter is read or written. */
 1007     return 0;
 1008 }
 1009 
 1010 static float get_band_numbits_NONE_mips(struct AACEncContext *s,
 1011                                         PutBitContext *pb, const float *in,
 1012                                         const float *scaled, int size, int scale_idx,
 1013                                         int cb, const float lambda, const float uplim,
 1014                                         int *bits)
 1015 {
          /*
           * Placeholder that must never be called: it unconditionally fails
           * av_assert0().  The return statement only satisfies the signature.
           */
 1016     av_assert0(0);
 1017     return 0;
 1018 }
 1019 
 1020 static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
 1021                                          PutBitContext *pb, const float *in,
 1022                                          const float *scaled, int size, int scale_idx,
 1023                                          int cb, const float lambda, const float uplim,
 1024                                          int *bits)
 1025 {
          /*
           * Returns the sum of the p_bits[] entries (table cb-1 of
           * ff_aac_spectral_bits) for the band, one signed quad of coefficients
           * per lookup.  Nothing is written to pb or to *bits; s, pb, lambda,
           * uplim and bits are not referenced.
           *
           * scaled supplies the values to quantize, in supplies the signs.
           * NOTE(review): elements i..i+3 are read unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
 1026     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1027     int i;
 1028     int qc1, qc2, qc3, qc4;
 1029     int curbits = 0;
 1030 
 1031     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1032 
 1033     for (i = 0; i < size; i += 4) {
 1034         int curidx;
 1035         int *in_int = (int *)&in[i];
 1036         int t0, t1, t2, t3, t4, t5, t6, t7;
 1037 
 1038         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1039         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1040         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1041         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1042 
 1043         __asm__ volatile (
 1044             ".set push                      \n\t"
 1045             ".set noreorder                 \n\t"
 1046 
                  /* qcN = (qcN > 0): every positive value collapses to 1 */
 1047             "slt    %[qc1], $zero,  %[qc1]  \n\t"
 1048             "slt    %[qc2], $zero,  %[qc2]  \n\t"
 1049             "slt    %[qc3], $zero,  %[qc3]  \n\t"
 1050             "slt    %[qc4], $zero,  %[qc4]  \n\t"
                  /* tN = top (sign) bit of the raw 32-bit word of in[i+N] */
 1051             "lw     %[t0],  0(%[in_int])    \n\t"
 1052             "lw     %[t1],  4(%[in_int])    \n\t"
 1053             "lw     %[t2],  8(%[in_int])    \n\t"
 1054             "lw     %[t3],  12(%[in_int])   \n\t"
 1055             "srl    %[t0],  %[t0],  31      \n\t"
 1056             "srl    %[t1],  %[t1],  31      \n\t"
 1057             "srl    %[t2],  %[t2],  31      \n\t"
 1058             "srl    %[t3],  %[t3],  31      \n\t"
                  /* negate qcN where the sign bit is set, giving -1, 0 or 1 */
 1059             "subu   %[t4],  $zero,  %[qc1]  \n\t"
 1060             "subu   %[t5],  $zero,  %[qc2]  \n\t"
 1061             "subu   %[t6],  $zero,  %[qc3]  \n\t"
 1062             "subu   %[t7],  $zero,  %[qc4]  \n\t"
 1063             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1064             "movn   %[qc2], %[t5],  %[t1]   \n\t"
 1065             "movn   %[qc3], %[t6],  %[t2]   \n\t"
 1066             "movn   %[qc4], %[t7],  %[t3]   \n\t"
 1067 
 1068             ".set pop                       \n\t"
 1069 
 1070             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1071               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1072               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1073               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 1074             : [in_int]"r"(in_int)
 1075             : "memory"
 1076         );
 1077 
              /*
               * Base-3 index of (qc1, qc2, qc3, qc4); the final +40
               * (= 27 + 9 + 3 + 1) shifts each digit from [-1, 1] to [0, 2].
               */
 1078         curidx = qc1;
 1079         curidx *= 3;
 1080         curidx += qc2;
 1081         curidx *= 3;
 1082         curidx += qc3;
 1083         curidx *= 3;
 1084         curidx += qc4;
 1085         curidx += 40;
 1086 
 1087         curbits += p_bits[curidx];
 1088     }
          /* the integer total is converted to float by the return type */
 1089     return curbits;
 1090 }
 1091 
 1092 static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
 1093                                          PutBitContext *pb, const float *in,
 1094                                          const float *scaled, int size, int scale_idx,
 1095                                          int cb, const float lambda, const float uplim,
 1096                                          int *bits)
 1097 {
          /*
           * Returns, summed over the band, p_bits[idx] + uquad_sign_bits[idx] for
           * each unsigned quad of quantized coefficients (values clamped to 2).
           * Nothing is written to pb or to *bits; s, pb, in, lambda, uplim and
           * bits are not referenced.
           *
           * NOTE(review): uquad_sign_bits is defined outside this view; presumably
           * it holds the number of sign bits for each index -- confirm.
           * NOTE(review): elements i..i+3 are read unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
 1098     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1099     int i;
 1100     int curbits = 0;
 1101     int qc1, qc2, qc3, qc4;
 1102 
 1103     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1104 
 1105     for (i = 0; i < size; i += 4) {
 1106         int curidx;
 1107         int t0, t1, t2, t3, t4;
 1108 
 1109         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1110         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1111         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1112         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1113 
 1114         __asm__ volatile (
 1115             ".set push                      \n\t"
 1116             ".set noreorder                 \n\t"
 1117 
                  /* t4 = 2; qcN = 2 wherever qcN > 2 */
 1118             "ori    %[t4],  $zero,  2       \n\t"
 1119             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
 1120             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
 1121             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
 1122             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
 1123             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1124             "movn   %[qc2], %[t4],  %[t1]   \n\t"
 1125             "movn   %[qc3], %[t4],  %[t2]   \n\t"
 1126             "movn   %[qc4], %[t4],  %[t3]   \n\t"
 1127 
 1128             ".set pop                       \n\t"
 1129 
 1130             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1131               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1132               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1133               [t4]"=&r"(t4)
 1134         );
 1135 
              /* base-3 index of (qc1, qc2, qc3, qc4) */
 1136         curidx = qc1;
 1137         curidx *= 3;
 1138         curidx += qc2;
 1139         curidx *= 3;
 1140         curidx += qc3;
 1141         curidx *= 3;
 1142         curidx += qc4;
 1143 
 1144         curbits += p_bits[curidx];
 1145         curbits += uquad_sign_bits[curidx];
 1146     }
          /* the integer total is converted to float by the return type */
 1147     return curbits;
 1148 }
 1149 
 1150 static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
 1151                                          PutBitContext *pb, const float *in,
 1152                                          const float *scaled, int size, int scale_idx,
 1153                                          int cb, const float lambda, const float uplim,
 1154                                          int *bits)
 1155 {
          /*
           * Returns the sum of the p_bits[] entries (table cb-1 of
           * ff_aac_spectral_bits) for the band, looked up per signed pair of
           * quantized coefficients; two pairs are handled per iteration.
           * Nothing is written to pb or to *bits; s, pb, lambda, uplim and bits
           * are not referenced.
           *
           * scaled supplies the values to quantize, in supplies the signs.
           * NOTE(review): elements i..i+3 are read unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
 1156     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1157     int i;
 1158     int qc1, qc2, qc3, qc4;
 1159     int curbits = 0;
 1160 
 1161     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
 1162 
 1163     for (i = 0; i < size; i += 4) {
 1164         int curidx, curidx2;
 1165         int *in_int = (int *)&in[i];
 1166         int t0, t1, t2, t3, t4, t5, t6, t7;
 1167 
 1168         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1169         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1170         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1171         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1172 
 1173         __asm__ volatile (
 1174             ".set push                      \n\t"
 1175             ".set noreorder                 \n\t"
 1176 
                  /* t4 = 4; qcN = 4 wherever qcN > 4 */
 1177             "ori    %[t4],  $zero,  4       \n\t"
 1178             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
 1179             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
 1180             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
 1181             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
 1182             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1183             "movn   %[qc2], %[t4],  %[t1]   \n\t"
 1184             "movn   %[qc3], %[t4],  %[t2]   \n\t"
 1185             "movn   %[qc4], %[t4],  %[t3]   \n\t"
                  /* tN = top (sign) bit of the raw 32-bit word of in[i+N] */
 1186             "lw     %[t0],  0(%[in_int])    \n\t"
 1187             "lw     %[t1],  4(%[in_int])    \n\t"
 1188             "lw     %[t2],  8(%[in_int])    \n\t"
 1189             "lw     %[t3],  12(%[in_int])   \n\t"
 1190             "srl    %[t0],  %[t0],  31      \n\t"
 1191             "srl    %[t1],  %[t1],  31      \n\t"
 1192             "srl    %[t2],  %[t2],  31      \n\t"
 1193             "srl    %[t3],  %[t3],  31      \n\t"
                  /* negate qcN where the sign bit is set (t4 is reused as -qc1 here) */
 1194             "subu   %[t4],  $zero,  %[qc1]  \n\t"
 1195             "subu   %[t5],  $zero,  %[qc2]  \n\t"
 1196             "subu   %[t6],  $zero,  %[qc3]  \n\t"
 1197             "subu   %[t7],  $zero,  %[qc4]  \n\t"
 1198             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1199             "movn   %[qc2], %[t5],  %[t1]   \n\t"
 1200             "movn   %[qc3], %[t6],  %[t2]   \n\t"
 1201             "movn   %[qc4], %[t7],  %[t3]   \n\t"
 1202 
 1203             ".set pop                       \n\t"
 1204 
 1205             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1206               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1207               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1208               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 1209             : [in_int]"r"(in_int)
 1210             : "memory"
 1211         );
 1212 
              /*
               * Pair index 9 * first + second; the +40 (= 9 * 4 + 4) shifts both
               * values from [-4, 4] to [0, 8].
               */
 1213         curidx  = 9 * qc1;
 1214         curidx += qc2 + 40;
 1215 
 1216         curidx2  = 9 * qc3;
 1217         curidx2 += qc4 + 40;
 1218 
 1219         curbits += p_bits[curidx] + p_bits[curidx2];
 1220     }
          /* the integer total is converted to float by the return type */
 1221     return curbits;
 1222 }
 1223 
 1224 static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
 1225                                           PutBitContext *pb, const float *in,
 1226                                           const float *scaled, int size, int scale_idx,
 1227                                           int cb, const float lambda, const float uplim,
 1228                                           int *bits)
 1229 {
          /*
           * Returns, summed over the band, p_bits[idx] + upair7_sign_bits[idx]
           * for each unsigned pair of quantized coefficients (values clamped to
           * 7); two pairs are handled per iteration.  Nothing is written to pb or
           * to *bits; s, pb, in, lambda, uplim and bits are not referenced.
           *
           * NOTE(review): upair7_sign_bits is defined outside this view;
           * presumably it holds the number of sign bits per index -- confirm.
           * NOTE(review): elements i..i+3 are read unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
 1230     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1231     int i;
 1232     int qc1, qc2, qc3, qc4;
 1233     int curbits = 0;
 1234 
 1235     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1236 
 1237     for (i = 0; i < size; i += 4) {
 1238         int curidx, curidx2;
 1239         int t0, t1, t2, t3, t4;
 1240 
 1241         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1242         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1243         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1244         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1245 
 1246         __asm__ volatile (
 1247             ".set push                      \n\t"
 1248             ".set noreorder                 \n\t"
 1249 
                  /* t4 = 7; qcN = 7 wherever qcN > 7 */
 1250             "ori    %[t4],  $zero,  7       \n\t"
 1251             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
 1252             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
 1253             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
 1254             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
 1255             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1256             "movn   %[qc2], %[t4],  %[t1]   \n\t"
 1257             "movn   %[qc3], %[t4],  %[t2]   \n\t"
 1258             "movn   %[qc4], %[t4],  %[t3]   \n\t"
 1259 
 1260             ".set pop                       \n\t"
 1261 
 1262             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1263               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1264               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1265               [t4]"=&r"(t4)
 1266         );
 1267 
              /* pair index: 8 * first + second (values are at most 7) */
 1268         curidx  = 8 * qc1;
 1269         curidx += qc2;
 1270 
 1271         curidx2  = 8 * qc3;
 1272         curidx2 += qc4;
 1273 
 1274         curbits += p_bits[curidx] +
 1275                    upair7_sign_bits[curidx] +
 1276                    p_bits[curidx2] +
 1277                    upair7_sign_bits[curidx2];
 1278     }
          /* the integer total is converted to float by the return type */
 1279     return curbits;
 1280 }
 1281 
 1282 static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
 1283                                            PutBitContext *pb, const float *in,
 1284                                            const float *scaled, int size, int scale_idx,
 1285                                            int cb, const float lambda, const float uplim,
 1286                                            int *bits)
 1287 {
          /*
           * Returns, summed over the band, p_bits[idx] + upair12_sign_bits[idx]
           * for each unsigned pair of quantized coefficients (values clamped to
           * 12); two pairs are handled per iteration.  Nothing is written to pb
           * or to *bits; s, pb, in, lambda, uplim and bits are not referenced.
           *
           * NOTE(review): upair12_sign_bits is defined outside this view;
           * presumably it holds the number of sign bits per index -- confirm.
           * NOTE(review): elements i..i+3 are read unconditionally, so size is
           * presumably a multiple of 4 -- confirm against the callers.
           */
 1288     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1289     int i;
 1290     int qc1, qc2, qc3, qc4;
 1291     int curbits = 0;
 1292 
 1293     uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1294 
 1295     for (i = 0; i < size; i += 4) {
 1296         int curidx, curidx2;
 1297         int t0, t1, t2, t3, t4;
 1298 
 1299         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1300         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1301         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1302         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1303 
 1304         __asm__ volatile (
 1305             ".set push                      \n\t"
 1306             ".set noreorder                 \n\t"
 1307 
                  /* t4 = 12; qcN = 12 wherever qcN > 12 */
 1308             "ori    %[t4],  $zero,  12      \n\t"
 1309             "slt    %[t0],  %[t4],  %[qc1]  \n\t"
 1310             "slt    %[t1],  %[t4],  %[qc2]  \n\t"
 1311             "slt    %[t2],  %[t4],  %[qc3]  \n\t"
 1312             "slt    %[t3],  %[t4],  %[qc4]  \n\t"
 1313             "movn   %[qc1], %[t4],  %[t0]   \n\t"
 1314             "movn   %[qc2], %[t4],  %[t1]   \n\t"
 1315             "movn   %[qc3], %[t4],  %[t2]   \n\t"
 1316             "movn   %[qc4], %[t4],  %[t3]   \n\t"
 1317 
 1318             ".set pop                       \n\t"
 1319 
 1320             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1321               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1322               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1323               [t4]"=&r"(t4)
 1324         );
 1325 
              /* pair index: 13 * first + second (values are at most 12) */
 1326         curidx  = 13 * qc1;
 1327         curidx += qc2;
 1328 
 1329         curidx2  = 13 * qc3;
 1330         curidx2 += qc4;
 1331 
 1332         curbits += p_bits[curidx] +
 1333                    p_bits[curidx2] +
 1334                    upair12_sign_bits[curidx] +
 1335                    upair12_sign_bits[curidx2];
 1336     }
          /* the integer total is converted to float by the return type */
 1337     return curbits;
 1338 }
 1339 
 1340 static float get_band_numbits_ESC_mips(struct AACEncContext *s,
 1341                                        PutBitContext *pb, const float *in,
 1342                                        const float *scaled, int size, int scale_idx,
 1343                                        int cb, const float lambda, const float uplim,
 1344                                        int *bits)
 1345 {
 1346     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1347     int i;
 1348     int qc1, qc2, qc3, qc4;
 1349     int curbits = 0;
 1350 
 1351     uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
 1352 
 1353     for (i = 0; i < size; i += 4) {
 1354         int curidx, curidx2;
 1355         int cond0, cond1, cond2, cond3;
 1356         int c1, c2, c3, c4;
 1357         int t4, t5;
 1358 
 1359         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1360         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1361         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1362         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1363 
 1364         __asm__ volatile (
 1365             ".set push                                  \n\t"
 1366             ".set noreorder                             \n\t"
 1367 
 1368             "ori        %[t4],      $zero,  15          \n\t"
 1369             "ori        %[t5],      $zero,  16          \n\t"
 1370             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
 1371             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
 1372             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
 1373             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
 1374             "srl        %[c1],      %[c1],  18          \n\t"
 1375             "srl        %[c2],      %[c2],  18          \n\t"
 1376             "srl        %[c3],      %[c3],  18          \n\t"
 1377             "srl        %[c4],      %[c4],  18          \n\t"
 1378             "slt        %[cond0],   %[t4],  %[qc1]      \n\t"
 1379             "slt        %[cond1],   %[t4],  %[qc2]      \n\t"
 1380             "slt        %[cond2],   %[t4],  %[qc3]      \n\t"
 1381             "slt        %[cond3],   %[t4],  %[qc4]      \n\t"
 1382             "movn       %[qc1],     %[t5],  %[cond0]    \n\t"
 1383             "movn       %[qc2],     %[t5],  %[cond1]    \n\t"
 1384             "movn       %[qc3],     %[t5],  %[cond2]    \n\t"
 1385             "movn       %[qc4],     %[t5],  %[cond3]    \n\t"
 1386             "ori        %[t5],      $zero,  31          \n\t"
 1387             "clz        %[c1],      %[c1]               \n\t"
 1388             "clz        %[c2],      %[c2]               \n\t"
 1389             "clz        %[c3],      %[c3]               \n\t"
 1390             "clz        %[c4],      %[c4]               \n\t"
 1391             "subu       %[c1],      %[t5],  %[c1]       \n\t"
 1392             "subu       %[c2],      %[t5],  %[c2]       \n\t"
 1393             "subu       %[c3],      %[t5],  %[c3]       \n\t"
 1394             "subu       %[c4],      %[t5],  %[c4]       \n\t"
 1395             "sll        %[c1],      %[c1],  1           \n\t"
 1396             "sll        %[c2],      %[c2],  1           \n\t"
 1397             "sll        %[c3],      %[c3],  1           \n\t"
 1398             "sll        %[c4],      %[c4],  1           \n\t"
 1399             "addiu      %[c1],      %[c1],  -3          \n\t"
 1400             "addiu      %[c2],      %[c2],  -3          \n\t"
 1401             "addiu      %[c3],      %[c3],  -3          \n\t"
 1402             "addiu      %[c4],      %[c4],  -3          \n\t"
 1403             "subu       %[cond0],   $zero,  %[cond0]    \n\t"
 1404             "subu       %[cond1],   $zero,  %[cond1]    \n\t"
 1405             "subu       %[cond2],   $zero,  %[cond2]    \n\t"
 1406             "subu       %[cond3],   $zero,  %[cond3]    \n\t"
 1407             "and        %[c1],      %[c1],  %[cond0]    \n\t"
 1408             "and        %[c2],      %[c2],  %[cond1]    \n\t"
 1409             "and        %[c3],      %[c3],  %[cond2]    \n\t"
 1410             "and        %[c4],      %[c4],  %[cond3]    \n\t"
 1411 
 1412             ".set pop                                   \n\t"
 1413 
 1414             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1415               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1416               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
 1417               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
 1418               [c1]"=&r"(c1), [c2]"=&r"(c2),
 1419               [c3]"=&r"(c3), [c4]"=&r"(c4),
 1420               [t4]"=&r"(t4), [t5]"=&r"(t5)
 1421         );
 1422 
 1423         curidx = 17 * qc1;
 1424         curidx += qc2;
 1425 
 1426         curidx2 = 17 * qc3;
 1427         curidx2 += qc4;
 1428 
 1429         curbits += p_bits[curidx];
 1430         curbits += esc_sign_bits[curidx];
 1431         curbits += p_bits[curidx2];
 1432         curbits += esc_sign_bits[curidx2];
 1433 
 1434         curbits += c1;
 1435         curbits += c2;
 1436         curbits += c3;
 1437         curbits += c4;
 1438     }
 1439     return curbits;
 1440 }
 1441 
 1442 static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
 1443                                              PutBitContext *pb, const float *in,
 1444                                              const float *scaled, int size, int scale_idx,
 1445                                              int cb, const float lambda, const float uplim,
 1446                                              int *bits) = {
 1447     get_band_numbits_ZERO_mips,
 1448     get_band_numbits_SQUAD_mips,
 1449     get_band_numbits_SQUAD_mips,
 1450     get_band_numbits_UQUAD_mips,
 1451     get_band_numbits_UQUAD_mips,
 1452     get_band_numbits_SPAIR_mips,
 1453     get_band_numbits_SPAIR_mips,
 1454     get_band_numbits_UPAIR7_mips,
 1455     get_band_numbits_UPAIR7_mips,
 1456     get_band_numbits_UPAIR12_mips,
 1457     get_band_numbits_UPAIR12_mips,
 1458     get_band_numbits_ESC_mips,
 1459     get_band_numbits_NONE_mips, /* cb 12 doesn't exist */
 1460     get_band_numbits_ZERO_mips,
 1461     get_band_numbits_ZERO_mips,
 1462     get_band_numbits_ZERO_mips,
 1463 };
 1464 
 1465 #define get_band_numbits(                                  \
 1466                                 s, pb, in, scaled, size, scale_idx, cb, \
 1467                                 lambda, uplim, bits)                    \
 1468     get_band_numbits_arr[cb](                              \
 1469                                 s, pb, in, scaled, size, scale_idx, cb, \
 1470                                 lambda, uplim, bits)
 1471 
 1472 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
 1473                                      const float *scaled, int size, int scale_idx,
 1474                                      int cb, const float lambda, const float uplim,
 1475                                      int *bits, float *energy, int rtz)
 1476 {
 1477     return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 1478 }
 1479 
 1480 /**
 1481  * Functions developed from template function and optimized for getting the band cost
 1482  */
 1483 #if HAVE_MIPSFPU
 1484 static float get_band_cost_ZERO_mips(struct AACEncContext *s,
 1485                                      PutBitContext *pb, const float *in,
 1486                                      const float *scaled, int size, int scale_idx,
 1487                                      int cb, const float lambda, const float uplim,
 1488                                      int *bits, float *energy)
 1489 {
 1490     int i;
 1491     float cost = 0;
 1492 
 1493     for (i = 0; i < size; i += 4) {
 1494         cost += in[i  ] * in[i  ];
 1495         cost += in[i+1] * in[i+1];
 1496         cost += in[i+2] * in[i+2];
 1497         cost += in[i+3] * in[i+3];
 1498     }
 1499     if (bits)
 1500         *bits = 0;
 1501     if (energy)
 1502         *energy = 0.0f;
 1503     return cost * lambda;
 1504 }
 1505 
 1506 static float get_band_cost_NONE_mips(struct AACEncContext *s,
 1507                                      PutBitContext *pb, const float *in,
 1508                                      const float *scaled, int size, int scale_idx,
 1509                                      int cb, const float lambda, const float uplim,
 1510                                      int *bits, float *energy)
 1511 {
 1512     av_assert0(0);
 1513     return 0;
 1514 }
 1515 
 1516 static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
 1517                                       PutBitContext *pb, const float *in,
 1518                                       const float *scaled, int size, int scale_idx,
 1519                                       int cb, const float lambda, const float uplim,
 1520                                       int *bits, float *energy)
 1521 {
 1522     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1523     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 1524     int i;
 1525     float cost = 0;
 1526     float qenergy = 0.0f;
 1527     int qc1, qc2, qc3, qc4;
 1528     int curbits = 0;
 1529 
 1530     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1531     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
 1532 
 1533     for (i = 0; i < size; i += 4) {
 1534         const float *vec;
 1535         int curidx;
 1536         int   *in_int = (int   *)&in[i];
 1537         float *in_pos = (float *)&in[i];
 1538         float di0, di1, di2, di3;
 1539         int t0, t1, t2, t3, t4, t5, t6, t7;
 1540 
 1541         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1542         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1543         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1544         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1545 
 1546         __asm__ volatile (
 1547             ".set push                                  \n\t"
 1548             ".set noreorder                             \n\t"
 1549 
 1550             "slt        %[qc1], $zero,  %[qc1]          \n\t"
 1551             "slt        %[qc2], $zero,  %[qc2]          \n\t"
 1552             "slt        %[qc3], $zero,  %[qc3]          \n\t"
 1553             "slt        %[qc4], $zero,  %[qc4]          \n\t"
 1554             "lw         %[t0],  0(%[in_int])            \n\t"
 1555             "lw         %[t1],  4(%[in_int])            \n\t"
 1556             "lw         %[t2],  8(%[in_int])            \n\t"
 1557             "lw         %[t3],  12(%[in_int])           \n\t"
 1558             "srl        %[t0],  %[t0],  31              \n\t"
 1559             "srl        %[t1],  %[t1],  31              \n\t"
 1560             "srl        %[t2],  %[t2],  31              \n\t"
 1561             "srl        %[t3],  %[t3],  31              \n\t"
 1562             "subu       %[t4],  $zero,  %[qc1]          \n\t"
 1563             "subu       %[t5],  $zero,  %[qc2]          \n\t"
 1564             "subu       %[t6],  $zero,  %[qc3]          \n\t"
 1565             "subu       %[t7],  $zero,  %[qc4]          \n\t"
 1566             "movn       %[qc1], %[t4],  %[t0]           \n\t"
 1567             "movn       %[qc2], %[t5],  %[t1]           \n\t"
 1568             "movn       %[qc3], %[t6],  %[t2]           \n\t"
 1569             "movn       %[qc4], %[t7],  %[t3]           \n\t"
 1570 
 1571             ".set pop                                   \n\t"
 1572 
 1573             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1574               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1575               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1576               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 1577             : [in_int]"r"(in_int)
 1578             : "memory"
 1579         );
 1580 
 1581         curidx = qc1;
 1582         curidx *= 3;
 1583         curidx += qc2;
 1584         curidx *= 3;
 1585         curidx += qc3;
 1586         curidx *= 3;
 1587         curidx += qc4;
 1588         curidx += 40;
 1589 
 1590         curbits += p_bits[curidx];
 1591         vec     = &p_codes[curidx*4];
 1592 
 1593         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
 1594                 +  vec[2]*vec[2] + vec[3]*vec[3];
 1595 
 1596         __asm__ volatile (
 1597             ".set push                                  \n\t"
 1598             ".set noreorder                             \n\t"
 1599 
 1600             "lwc1       $f0,    0(%[in_pos])            \n\t"
 1601             "lwc1       $f1,    0(%[vec])               \n\t"
 1602             "lwc1       $f2,    4(%[in_pos])            \n\t"
 1603             "lwc1       $f3,    4(%[vec])               \n\t"
 1604             "lwc1       $f4,    8(%[in_pos])            \n\t"
 1605             "lwc1       $f5,    8(%[vec])               \n\t"
 1606             "lwc1       $f6,    12(%[in_pos])           \n\t"
 1607             "lwc1       $f7,    12(%[vec])              \n\t"
 1608             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
 1609             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
 1610             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
 1611             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
 1612 
 1613             ".set pop                                   \n\t"
 1614 
 1615             : [di0]"=&f"(di0), [di1]"=&f"(di1),
 1616               [di2]"=&f"(di2), [di3]"=&f"(di3)
 1617             : [in_pos]"r"(in_pos), [vec]"r"(vec),
 1618               [IQ]"f"(IQ)
 1619             : "$f0", "$f1", "$f2", "$f3",
 1620               "$f4", "$f5", "$f6", "$f7",
 1621               "memory"
 1622         );
 1623 
 1624         cost += di0 * di0 + di1 * di1
 1625                 + di2 * di2 + di3 * di3;
 1626     }
 1627 
 1628     if (bits)
 1629         *bits = curbits;
 1630     if (energy)
 1631         *energy = qenergy * (IQ*IQ);
 1632     return cost * lambda + curbits;
 1633 }
 1634 
 1635 static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
 1636                                       PutBitContext *pb, const float *in,
 1637                                       const float *scaled, int size, int scale_idx,
 1638                                       int cb, const float lambda, const float uplim,
 1639                                       int *bits, float *energy)
 1640 {
 1641     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1642     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 1643     int i;
 1644     float cost = 0;
 1645     float qenergy = 0.0f;
 1646     int curbits = 0;
 1647     int qc1, qc2, qc3, qc4;
 1648 
 1649     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
 1650     float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
 1651 
 1652     for (i = 0; i < size; i += 4) {
 1653         const float *vec;
 1654         int curidx;
 1655         float *in_pos = (float *)&in[i];
 1656         float di0, di1, di2, di3;
 1657         int t0, t1, t2, t3, t4;
 1658 
 1659         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1660         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1661         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1662         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1663 
 1664         __asm__ volatile (
 1665             ".set push                                  \n\t"
 1666             ".set noreorder                             \n\t"
 1667 
 1668             "ori        %[t4],  $zero,  2               \n\t"
 1669             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
 1670             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
 1671             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
 1672             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
 1673             "movn       %[qc1], %[t4],  %[t0]           \n\t"
 1674             "movn       %[qc2], %[t4],  %[t1]           \n\t"
 1675             "movn       %[qc3], %[t4],  %[t2]           \n\t"
 1676             "movn       %[qc4], %[t4],  %[t3]           \n\t"
 1677 
 1678             ".set pop                                   \n\t"
 1679 
 1680             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1681               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1682               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1683               [t4]"=&r"(t4)
 1684         );
 1685 
 1686         curidx = qc1;
 1687         curidx *= 3;
 1688         curidx += qc2;
 1689         curidx *= 3;
 1690         curidx += qc3;
 1691         curidx *= 3;
 1692         curidx += qc4;
 1693 
 1694         curbits += p_bits[curidx];
 1695         curbits += uquad_sign_bits[curidx];
 1696         vec     = &p_codes[curidx*4];
 1697 
 1698         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
 1699                 +  vec[2]*vec[2] + vec[3]*vec[3];
 1700 
 1701         __asm__ volatile (
 1702             ".set push                                  \n\t"
 1703             ".set noreorder                             \n\t"
 1704 
 1705             "lwc1       %[di0], 0(%[in_pos])            \n\t"
 1706             "lwc1       %[di1], 4(%[in_pos])            \n\t"
 1707             "lwc1       %[di2], 8(%[in_pos])            \n\t"
 1708             "lwc1       %[di3], 12(%[in_pos])           \n\t"
 1709             "abs.s      %[di0], %[di0]                  \n\t"
 1710             "abs.s      %[di1], %[di1]                  \n\t"
 1711             "abs.s      %[di2], %[di2]                  \n\t"
 1712             "abs.s      %[di3], %[di3]                  \n\t"
 1713             "lwc1       $f0,    0(%[vec])               \n\t"
 1714             "lwc1       $f1,    4(%[vec])               \n\t"
 1715             "lwc1       $f2,    8(%[vec])               \n\t"
 1716             "lwc1       $f3,    12(%[vec])              \n\t"
 1717             "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
 1718             "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
 1719             "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
 1720             "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
 1721 
 1722             ".set pop                                   \n\t"
 1723 
 1724             : [di0]"=&f"(di0), [di1]"=&f"(di1),
 1725               [di2]"=&f"(di2), [di3]"=&f"(di3)
 1726             : [in_pos]"r"(in_pos), [vec]"r"(vec),
 1727               [IQ]"f"(IQ)
 1728             : "$f0", "$f1", "$f2", "$f3",
 1729               "memory"
 1730         );
 1731 
 1732         cost += di0 * di0 + di1 * di1
 1733                 + di2 * di2 + di3 * di3;
 1734     }
 1735 
 1736     if (bits)
 1737         *bits = curbits;
 1738     if (energy)
 1739         *energy = qenergy * (IQ*IQ);
 1740     return cost * lambda + curbits;
 1741 }
 1742 
 1743 static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
 1744                                       PutBitContext *pb, const float *in,
 1745                                       const float *scaled, int size, int scale_idx,
 1746                                       int cb, const float lambda, const float uplim,
 1747                                       int *bits, float *energy)
 1748 {
 1749     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1750     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 1751     int i;
 1752     float cost = 0;
 1753     float qenergy = 0.0f;
 1754     int qc1, qc2, qc3, qc4;
 1755     int curbits = 0;
 1756 
 1757     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1758     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
 1759 
 1760     for (i = 0; i < size; i += 4) {
 1761         const float *vec, *vec2;
 1762         int curidx, curidx2;
 1763         int   *in_int = (int   *)&in[i];
 1764         float *in_pos = (float *)&in[i];
 1765         float di0, di1, di2, di3;
 1766         int t0, t1, t2, t3, t4, t5, t6, t7;
 1767 
 1768         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1769         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1770         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1771         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1772 
 1773         __asm__ volatile (
 1774             ".set push                                  \n\t"
 1775             ".set noreorder                             \n\t"
 1776 
 1777             "ori        %[t4],  $zero,  4               \n\t"
 1778             "slt        %[t0],  %[t4],  %[qc1]          \n\t"
 1779             "slt        %[t1],  %[t4],  %[qc2]          \n\t"
 1780             "slt        %[t2],  %[t4],  %[qc3]          \n\t"
 1781             "slt        %[t3],  %[t4],  %[qc4]          \n\t"
 1782             "movn       %[qc1], %[t4],  %[t0]           \n\t"
 1783             "movn       %[qc2], %[t4],  %[t1]           \n\t"
 1784             "movn       %[qc3], %[t4],  %[t2]           \n\t"
 1785             "movn       %[qc4], %[t4],  %[t3]           \n\t"
 1786             "lw         %[t0],  0(%[in_int])            \n\t"
 1787             "lw         %[t1],  4(%[in_int])            \n\t"
 1788             "lw         %[t2],  8(%[in_int])            \n\t"
 1789             "lw         %[t3],  12(%[in_int])           \n\t"
 1790             "srl        %[t0],  %[t0],  31              \n\t"
 1791             "srl        %[t1],  %[t1],  31              \n\t"
 1792             "srl        %[t2],  %[t2],  31              \n\t"
 1793             "srl        %[t3],  %[t3],  31              \n\t"
 1794             "subu       %[t4],  $zero,  %[qc1]          \n\t"
 1795             "subu       %[t5],  $zero,  %[qc2]          \n\t"
 1796             "subu       %[t6],  $zero,  %[qc3]          \n\t"
 1797             "subu       %[t7],  $zero,  %[qc4]          \n\t"
 1798             "movn       %[qc1], %[t4],  %[t0]           \n\t"
 1799             "movn       %[qc2], %[t5],  %[t1]           \n\t"
 1800             "movn       %[qc3], %[t6],  %[t2]           \n\t"
 1801             "movn       %[qc4], %[t7],  %[t3]           \n\t"
 1802 
 1803             ".set pop                                   \n\t"
 1804 
 1805             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1806               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1807               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1808               [t4]"=&r"(t4), [t5]"=&r"(t5), [t6]"=&r"(t6), [t7]"=&r"(t7)
 1809             : [in_int]"r"(in_int)
 1810             : "memory"
 1811         );
 1812 
 1813         curidx = 9 * qc1;
 1814         curidx += qc2 + 40;
 1815 
 1816         curidx2 = 9 * qc3;
 1817         curidx2 += qc4 + 40;
 1818 
 1819         curbits += p_bits[curidx];
 1820         curbits += p_bits[curidx2];
 1821 
 1822         vec     = &p_codes[curidx*2];
 1823         vec2    = &p_codes[curidx2*2];
 1824 
 1825         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
 1826                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
 1827 
 1828         __asm__ volatile (
 1829             ".set push                                  \n\t"
 1830             ".set noreorder                             \n\t"
 1831 
 1832             "lwc1       $f0,    0(%[in_pos])            \n\t"
 1833             "lwc1       $f1,    0(%[vec])               \n\t"
 1834             "lwc1       $f2,    4(%[in_pos])            \n\t"
 1835             "lwc1       $f3,    4(%[vec])               \n\t"
 1836             "lwc1       $f4,    8(%[in_pos])            \n\t"
 1837             "lwc1       $f5,    0(%[vec2])              \n\t"
 1838             "lwc1       $f6,    12(%[in_pos])           \n\t"
 1839             "lwc1       $f7,    4(%[vec2])              \n\t"
 1840             "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
 1841             "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
 1842             "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
 1843             "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
 1844 
 1845             ".set pop                                   \n\t"
 1846 
 1847             : [di0]"=&f"(di0), [di1]"=&f"(di1),
 1848               [di2]"=&f"(di2), [di3]"=&f"(di3)
 1849             : [in_pos]"r"(in_pos), [vec]"r"(vec),
 1850               [vec2]"r"(vec2), [IQ]"f"(IQ)
 1851             : "$f0", "$f1", "$f2", "$f3",
 1852               "$f4", "$f5", "$f6", "$f7",
 1853               "memory"
 1854         );
 1855 
 1856         cost += di0 * di0 + di1 * di1
 1857                 + di2 * di2 + di3 * di3;
 1858     }
 1859 
 1860     if (bits)
 1861         *bits = curbits;
 1862     if (energy)
 1863         *energy = qenergy * (IQ*IQ);
 1864     return cost * lambda + curbits;
 1865 }
 1866 
 1867 static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
 1868                                        PutBitContext *pb, const float *in,
 1869                                        const float *scaled, int size, int scale_idx,
 1870                                        int cb, const float lambda, const float uplim,
 1871                                        int *bits, float *energy)
 1872 {
 1873     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 1874     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 1875     int i;
 1876     float cost = 0;
 1877     float qenergy = 0.0f;
 1878     int qc1, qc2, qc3, qc4;
 1879     int curbits = 0;
 1880 
 1881     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
 1882     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
 1883 
 1884     for (i = 0; i < size; i += 4) {
 1885         const float *vec, *vec2;
 1886         int curidx, curidx2, sign1, count1, sign2, count2;
 1887         int   *in_int = (int   *)&in[i];
 1888         float *in_pos = (float *)&in[i];
 1889         float di0, di1, di2, di3;
 1890         int t0, t1, t2, t3, t4;
 1891 
 1892         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 1893         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 1894         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 1895         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 1896 
 1897         __asm__ volatile (
 1898             ".set push                                          \n\t"
 1899             ".set noreorder                                     \n\t"
 1900 
 1901             "ori        %[t4],      $zero,      7               \n\t"
 1902             "ori        %[sign1],   $zero,      0               \n\t"
 1903             "ori        %[sign2],   $zero,      0               \n\t"
 1904             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
 1905             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
 1906             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
 1907             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
 1908             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
 1909             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
 1910             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
 1911             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
 1912             "lw         %[t0],      0(%[in_int])                \n\t"
 1913             "lw         %[t1],      4(%[in_int])                \n\t"
 1914             "lw         %[t2],      8(%[in_int])                \n\t"
 1915             "lw         %[t3],      12(%[in_int])               \n\t"
 1916             "slt        %[t0],      %[t0],      $zero           \n\t"
 1917             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
 1918             "slt        %[t2],      %[t2],      $zero           \n\t"
 1919             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
 1920             "slt        %[t1],      %[t1],      $zero           \n\t"
 1921             "sll        %[t0],      %[sign1],   1               \n\t"
 1922             "or         %[t0],      %[t0],      %[t1]           \n\t"
 1923             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
 1924             "slt        %[t3],      %[t3],      $zero           \n\t"
 1925             "sll        %[t0],      %[sign2],   1               \n\t"
 1926             "or         %[t0],      %[t0],      %[t3]           \n\t"
 1927             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
 1928             "slt        %[count1],  $zero,      %[qc1]          \n\t"
 1929             "slt        %[t1],      $zero,      %[qc2]          \n\t"
 1930             "slt        %[count2],  $zero,      %[qc3]          \n\t"
 1931             "slt        %[t2],      $zero,      %[qc4]          \n\t"
 1932             "addu       %[count1],  %[count1],  %[t1]           \n\t"
 1933             "addu       %[count2],  %[count2],  %[t2]           \n\t"
 1934 
 1935             ".set pop                                           \n\t"
 1936 
 1937             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 1938               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 1939               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 1940               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 1941               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 1942               [t4]"=&r"(t4)
 1943             : [in_int]"r"(in_int)
 1944             : "memory"
 1945         );
 1946 
 1947         curidx = 8 * qc1;
 1948         curidx += qc2;
 1949 
 1950         curidx2 = 8 * qc3;
 1951         curidx2 += qc4;
 1952 
 1953         curbits += p_bits[curidx];
 1954         curbits += upair7_sign_bits[curidx];
 1955         vec     = &p_codes[curidx*2];
 1956 
 1957         curbits += p_bits[curidx2];
 1958         curbits += upair7_sign_bits[curidx2];
 1959         vec2    = &p_codes[curidx2*2];
 1960 
 1961         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
 1962                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
 1963 
 1964         __asm__ volatile (
 1965             ".set push                                          \n\t"
 1966             ".set noreorder                                     \n\t"
 1967 
 1968             "lwc1       %[di0],     0(%[in_pos])                \n\t"
 1969             "lwc1       %[di1],     4(%[in_pos])                \n\t"
 1970             "lwc1       %[di2],     8(%[in_pos])                \n\t"
 1971             "lwc1       %[di3],     12(%[in_pos])               \n\t"
 1972             "abs.s      %[di0],     %[di0]                      \n\t"
 1973             "abs.s      %[di1],     %[di1]                      \n\t"
 1974             "abs.s      %[di2],     %[di2]                      \n\t"
 1975             "abs.s      %[di3],     %[di3]                      \n\t"
 1976             "lwc1       $f0,        0(%[vec])                   \n\t"
 1977             "lwc1       $f1,        4(%[vec])                   \n\t"
 1978             "lwc1       $f2,        0(%[vec2])                  \n\t"
 1979             "lwc1       $f3,        4(%[vec2])                  \n\t"
 1980             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
 1981             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
 1982             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
 1983             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
 1984 
 1985             ".set pop                                           \n\t"
 1986 
 1987             : [di0]"=&f"(di0), [di1]"=&f"(di1),
 1988               [di2]"=&f"(di2), [di3]"=&f"(di3)
 1989             : [in_pos]"r"(in_pos), [vec]"r"(vec),
 1990               [vec2]"r"(vec2), [IQ]"f"(IQ)
 1991             : "$f0", "$f1", "$f2", "$f3",
 1992               "memory"
 1993         );
 1994 
 1995         cost += di0 * di0 + di1 * di1
 1996                 + di2 * di2 + di3 * di3;
 1997     }
 1998 
 1999     if (bits)
 2000         *bits = curbits;
 2001     if (energy)
 2002         *energy = qenergy * (IQ*IQ);
 2003     return cost * lambda + curbits;
 2004 }
 2005 
 2006 static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
 2007                                         PutBitContext *pb, const float *in,
 2008                                         const float *scaled, int size, int scale_idx,
 2009                                         int cb, const float lambda, const float uplim,
 2010                                         int *bits, float *energy)
 2011 {
 2012     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 2013     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 2014     int i;
 2015     float cost = 0;
 2016     float qenergy = 0.0f;
 2017     int qc1, qc2, qc3, qc4;
 2018     int curbits = 0;
 2019 
 2020     uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
 2021     float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
 2022 
 2023     for (i = 0; i < size; i += 4) {
 2024         const float *vec, *vec2;
 2025         int curidx, curidx2;
 2026         int sign1, count1, sign2, count2;
 2027         int   *in_int = (int   *)&in[i];
 2028         float *in_pos = (float *)&in[i];
 2029         float di0, di1, di2, di3;
 2030         int t0, t1, t2, t3, t4;
 2031 
 2032         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 2033         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 2034         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 2035         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 2036 
 2037         __asm__ volatile (
 2038             ".set push                                          \n\t"
 2039             ".set noreorder                                     \n\t"
 2040 
 2041             "ori        %[t4],      $zero,      12              \n\t"
 2042             "ori        %[sign1],   $zero,      0               \n\t"
 2043             "ori        %[sign2],   $zero,      0               \n\t"
 2044             "slt        %[t0],      %[t4],      %[qc1]          \n\t"
 2045             "slt        %[t1],      %[t4],      %[qc2]          \n\t"
 2046             "slt        %[t2],      %[t4],      %[qc3]          \n\t"
 2047             "slt        %[t3],      %[t4],      %[qc4]          \n\t"
 2048             "movn       %[qc1],     %[t4],      %[t0]           \n\t"
 2049             "movn       %[qc2],     %[t4],      %[t1]           \n\t"
 2050             "movn       %[qc3],     %[t4],      %[t2]           \n\t"
 2051             "movn       %[qc4],     %[t4],      %[t3]           \n\t"
 2052             "lw         %[t0],      0(%[in_int])                \n\t"
 2053             "lw         %[t1],      4(%[in_int])                \n\t"
 2054             "lw         %[t2],      8(%[in_int])                \n\t"
 2055             "lw         %[t3],      12(%[in_int])               \n\t"
 2056             "slt        %[t0],      %[t0],      $zero           \n\t"
 2057             "movn       %[sign1],   %[t0],      %[qc1]          \n\t"
 2058             "slt        %[t2],      %[t2],      $zero           \n\t"
 2059             "movn       %[sign2],   %[t2],      %[qc3]          \n\t"
 2060             "slt        %[t1],      %[t1],      $zero           \n\t"
 2061             "sll        %[t0],      %[sign1],   1               \n\t"
 2062             "or         %[t0],      %[t0],      %[t1]           \n\t"
 2063             "movn       %[sign1],   %[t0],      %[qc2]          \n\t"
 2064             "slt        %[t3],      %[t3],      $zero           \n\t"
 2065             "sll        %[t0],      %[sign2],   1               \n\t"
 2066             "or         %[t0],      %[t0],      %[t3]           \n\t"
 2067             "movn       %[sign2],   %[t0],      %[qc4]          \n\t"
 2068             "slt        %[count1],  $zero,      %[qc1]          \n\t"
 2069             "slt        %[t1],      $zero,      %[qc2]          \n\t"
 2070             "slt        %[count2],  $zero,      %[qc3]          \n\t"
 2071             "slt        %[t2],      $zero,      %[qc4]          \n\t"
 2072             "addu       %[count1],  %[count1],  %[t1]           \n\t"
 2073             "addu       %[count2],  %[count2],  %[t2]           \n\t"
 2074 
 2075             ".set pop                                           \n\t"
 2076 
 2077             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 2078               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 2079               [sign1]"=&r"(sign1), [count1]"=&r"(count1),
 2080               [sign2]"=&r"(sign2), [count2]"=&r"(count2),
 2081               [t0]"=&r"(t0), [t1]"=&r"(t1), [t2]"=&r"(t2), [t3]"=&r"(t3),
 2082               [t4]"=&r"(t4)
 2083             : [in_int]"r"(in_int)
 2084             : "memory"
 2085         );
 2086 
 2087         curidx = 13 * qc1;
 2088         curidx += qc2;
 2089 
 2090         curidx2 = 13 * qc3;
 2091         curidx2 += qc4;
 2092 
 2093         curbits += p_bits[curidx];
 2094         curbits += p_bits[curidx2];
 2095         curbits += upair12_sign_bits[curidx];
 2096         curbits += upair12_sign_bits[curidx2];
 2097         vec     = &p_codes[curidx*2];
 2098         vec2    = &p_codes[curidx2*2];
 2099 
 2100         qenergy += vec[0]*vec[0] + vec[1]*vec[1]
 2101                 +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
 2102 
 2103         __asm__ volatile (
 2104             ".set push                                          \n\t"
 2105             ".set noreorder                                     \n\t"
 2106 
 2107             "lwc1       %[di0],     0(%[in_pos])                \n\t"
 2108             "lwc1       %[di1],     4(%[in_pos])                \n\t"
 2109             "lwc1       %[di2],     8(%[in_pos])                \n\t"
 2110             "lwc1       %[di3],     12(%[in_pos])               \n\t"
 2111             "abs.s      %[di0],     %[di0]                      \n\t"
 2112             "abs.s      %[di1],     %[di1]                      \n\t"
 2113             "abs.s      %[di2],     %[di2]                      \n\t"
 2114             "abs.s      %[di3],     %[di3]                      \n\t"
 2115             "lwc1       $f0,        0(%[vec])                   \n\t"
 2116             "lwc1       $f1,        4(%[vec])                   \n\t"
 2117             "lwc1       $f2,        0(%[vec2])                  \n\t"
 2118             "lwc1       $f3,        4(%[vec2])                  \n\t"
 2119             "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
 2120             "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
 2121             "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
 2122             "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
 2123 
 2124             ".set pop                                           \n\t"
 2125 
 2126             : [di0]"=&f"(di0), [di1]"=&f"(di1),
 2127               [di2]"=&f"(di2), [di3]"=&f"(di3)
 2128             : [in_pos]"r"(in_pos), [vec]"r"(vec),
 2129               [vec2]"r"(vec2), [IQ]"f"(IQ)
 2130             : "$f0", "$f1", "$f2", "$f3",
 2131               "memory"
 2132         );
 2133 
 2134         cost += di0 * di0 + di1 * di1
 2135                 + di2 * di2 + di3 * di3;
 2136     }
 2137 
 2138     if (bits)
 2139         *bits = curbits;
 2140     if (energy)
 2141         *energy = qenergy * (IQ*IQ);
 2142     return cost * lambda + curbits;
 2143 }
 2144 
 2145 static float get_band_cost_ESC_mips(struct AACEncContext *s,
 2146                                     PutBitContext *pb, const float *in,
 2147                                     const float *scaled, int size, int scale_idx,
 2148                                     int cb, const float lambda, const float uplim,
 2149                                     int *bits, float *energy)
 2150 {
 2151     const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
 2152     const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
 2153     const float CLIPPED_ESCAPE = 165140.0f * IQ;
 2154     int i;
 2155     float cost = 0;
 2156     float qenergy = 0.0f;
 2157     int qc1, qc2, qc3, qc4;
 2158     int curbits = 0;
 2159 
 2160     uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
 2161     float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
 2162 
 2163     for (i = 0; i < size; i += 4) {
 2164         const float *vec, *vec2;
 2165         int curidx, curidx2;
 2166         float t1, t2, t3, t4, V;
 2167         float di1, di2, di3, di4;
 2168         int cond0, cond1, cond2, cond3;
 2169         int c1, c2, c3, c4;
 2170         int t6, t7;
 2171 
 2172         qc1 = scaled[i  ] * Q34 + ROUND_STANDARD;
 2173         qc2 = scaled[i+1] * Q34 + ROUND_STANDARD;
 2174         qc3 = scaled[i+2] * Q34 + ROUND_STANDARD;
 2175         qc4 = scaled[i+3] * Q34 + ROUND_STANDARD;
 2176 
 2177         __asm__ volatile (
 2178             ".set push                                  \n\t"
 2179             ".set noreorder                             \n\t"
 2180 
 2181             "ori        %[t6],      $zero,  15          \n\t"
 2182             "ori        %[t7],      $zero,  16          \n\t"
 2183             "shll_s.w   %[c1],      %[qc1], 18          \n\t"
 2184             "shll_s.w   %[c2],      %[qc2], 18          \n\t"
 2185             "shll_s.w   %[c3],      %[qc3], 18          \n\t"
 2186             "shll_s.w   %[c4],      %[qc4], 18          \n\t"
 2187             "srl        %[c1],      %[c1],  18          \n\t"
 2188             "srl        %[c2],      %[c2],  18          \n\t"
 2189             "srl        %[c3],      %[c3],  18          \n\t"
 2190             "srl        %[c4],      %[c4],  18          \n\t"
 2191             "slt        %[cond0],   %[t6],  %[qc1]      \n\t"
 2192             "slt        %[cond1],   %[t6],  %[qc2]      \n\t"
 2193             "slt        %[cond2],   %[t6],  %[qc3]      \n\t"
 2194             "slt        %[cond3],   %[t6],  %[qc4]      \n\t"
 2195             "movn       %[qc1],     %[t7],  %[cond0]    \n\t"
 2196             "movn       %[qc2],     %[t7],  %[cond1]    \n\t"
 2197             "movn       %[qc3],     %[t7],  %[cond2]    \n\t"
 2198             "movn       %[qc4],     %[t7],  %[cond3]    \n\t"
 2199 
 2200             ".set pop                                   \n\t"
 2201 
 2202             : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
 2203               [qc3]"+r"(qc3), [qc4]"+r"(qc4),
 2204               [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
 2205               [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
 2206               [c1]"=&r"(c1), [c2]"=&r"(c2),
 2207               [c3]"=&r"(c3), [c4]"=&r"(c4),
 2208               [t6]"=&r"(t6), [t7]"=&r"(t7)
 2209         );
 2210 
 2211         curidx = 17 * qc1;
 2212         curidx += qc2;
 2213 
 2214         curidx2 = 17 * qc3;
 2215         curidx2 += qc4;
 2216 
 2217         curbits += p_bits[curidx];
 2218         curbits += esc_sign_bits[curidx];
 2219         vec     = &p_codes[curidx*2];
 2220 
 2221         curbits += p_bits[curidx2];
 2222         curbits += esc_sign_bits[curidx2];
 2223         vec2     = &p_codes[curidx2*2];
 2224 
 2225         curbits += (av_log2(c1) * 2 - 3) & (-cond0);
 2226         curbits += (av_log2(c2) * 2 - 3) & (-cond1);
 2227         curbits += (av_log2(c3) * 2 - 3) & (-cond2);
 2228         curbits += (av_log2(c4) * 2 - 3) & (-cond3);
 2229 
 2230         t1 = fabsf(in[i  ]);
 2231         t2 = fabsf(in[i+1]);
 2232         t3 = fabsf(in[i+2]);
 2233         t4 = fabsf(in[i+3]);
 2234 
 2235         if (cond0) {
 2236             if (t1 >= CLIPPED_ESCAPE) {
 2237                 di1 = t1 - CLIPPED_ESCAPE;
 2238                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
 2239             } else {
 2240                 di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
 2241                 qenergy += V*V;
 2242             }
 2243         } else {
 2244             di1 = t1 - (V = vec[0] * IQ);
 2245             qenergy += V*V;
 2246         }
 2247 
 2248         if (cond1) {
 2249             if (t2 >= CLIPPED_ESCAPE) {
 2250                 di2 = t2 - CLIPPED_ESCAPE;
 2251                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
 2252             } else {
 2253                 di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
 2254                 qenergy += V*V;
 2255             }
 2256         } else {
 2257             di2 = t2 - (V = vec[1] * IQ);
 2258             qenergy += V*V;
 2259         }
 2260 
 2261         if (cond2) {
 2262             if (t3 >= CLIPPED_ESCAPE) {
 2263                 di3 = t3 - CLIPPED_ESCAPE;
 2264                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
 2265             } else {
 2266                 di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
 2267                 qenergy += V*V;
 2268             }
 2269         } else {
 2270             di3 = t3 - (V = vec2[0] * IQ);
 2271             qenergy += V*V;
 2272         }
 2273 
 2274         if (cond3) {
 2275             if (t4 >= CLIPPED_ESCAPE) {
 2276                 di4 = t4 - CLIPPED_ESCAPE;
 2277                 qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
 2278             } else {
 2279                 di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
 2280                 qenergy += V*V;
 2281             }
 2282         } else {
 2283             di4 = t4 - (V = vec2[1]*IQ);
 2284             qenergy += V*V;
 2285         }
 2286 
 2287         cost += di1 * di1 + di2 * di2
 2288                 + di3 * di3 + di4 * di4;
 2289     }
 2290 
 2291     if (bits)
 2292         *bits = curbits;
 2293     return cost * lambda + curbits;
 2294 }
 2295 
 2296 static float (*const get_band_cost_arr[])(struct AACEncContext *s,
 2297                                           PutBitContext *pb, const float *in,
 2298                                           const float *scaled, int size, int scale_idx,
 2299                                           int cb, const float lambda, const float uplim,
 2300                                           int *bits, float *energy) = {
 2301     get_band_cost_ZERO_mips,
 2302     get_band_cost_SQUAD_mips,
 2303     get_band_cost_SQUAD_mips,
 2304     get_band_cost_UQUAD_mips,
 2305     get_band_cost_UQUAD_mips,
 2306     get_band_cost_SPAIR_mips,
 2307     get_band_cost_SPAIR_mips,
 2308     get_band_cost_UPAIR7_mips,
 2309     get_band_cost_UPAIR7_mips,
 2310     get_band_cost_UPAIR12_mips,
 2311     get_band_cost_UPAIR12_mips,
 2312     get_band_cost_ESC_mips,
 2313     get_band_cost_NONE_mips, /* cb 12 doesn't exist */
 2314     get_band_cost_ZERO_mips,
 2315     get_band_cost_ZERO_mips,
 2316     get_band_cost_ZERO_mips,
 2317 };
 2318 
 2319 #define get_band_cost(                                  \
 2320                                 s, pb, in, scaled, size, scale_idx, cb, \
 2321                                 lambda, uplim, bits, energy)            \
 2322     get_band_cost_arr[cb](                              \
 2323                                 s, pb, in, scaled, size, scale_idx, cb, \
 2324                                 lambda, uplim, bits, energy)
 2325 
 2326 static float quantize_band_cost(struct AACEncContext *s, const float *in,
 2327                                 const float *scaled, int size, int scale_idx,
 2328                                 int cb, const float lambda, const float uplim,
 2329                                 int *bits, float *energy, int rtz)
 2330 {
 2331     return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
 2332 }
 2333 
 2334 #include "libavcodec/aacenc_quantization_misc.h"
 2335 
 2336 #include "libavcodec/aaccoder_twoloop.h"
 2337 
 2338 static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
 2339 {
 2340     int start = 0, i, w, w2, g, sid_sf_boost, prev_mid, prev_side;
 2341     uint8_t nextband0[128], nextband1[128];
 2342     float M[128], S[128];
 2343     float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
 2344     const float lambda = s->lambda;
 2345     const float mslambda = FFMIN(1.0f, lambda / 120.f);
 2346     SingleChannelElement *sce0 = &cpe->ch[0];
 2347     SingleChannelElement *sce1 = &cpe->ch[1];
 2348     if (!cpe->common_window)
 2349         return;
 2350 
 2351     /** Scout out next nonzero bands */
 2352     ff_init_nextband_map(sce0, nextband0);
 2353     ff_init_nextband_map(sce1, nextband1);
 2354 
 2355     prev_mid = sce0->sf_idx[0];
 2356     prev_side = sce1->sf_idx[0];
 2357     for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
 2358         start = 0;
 2359         for (g = 0;  g < sce0->ics.num_swb; g++) {
 2360             float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
 2361             if (!cpe->is_mask[w*16+g])
 2362                 cpe->ms_mask[w*16+g] = 0;
 2363             if (!sce0->zeroes[w*16+g] && !sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g]) {
 2364                 float Mmax = 0.0f, Smax = 0.0f;
 2365 
 2366                 /* Must compute mid/side SF and book for the whole window group */
 2367                 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
 2368                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
 2369                         M[i] = (sce0->coeffs[start+(w+w2)*128+i]
 2370                               + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
 2371                         S[i] =  M[i]
 2372                               - sce1->coeffs[start+(w+w2)*128+i];
 2373                     }
 2374                     abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
 2375                     abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
 2376                     for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
 2377                         Mmax = FFMAX(Mmax, M34[i]);
 2378                         Smax = FFMAX(Smax, S34[i]);
 2379                     }
 2380                 }
 2381 
 2382                 for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
 2383                     float dist1 = 0.0f, dist2 = 0.0f;
 2384                     int B0 = 0, B1 = 0;
 2385                     int minidx;
 2386                     int mididx, sididx;
 2387                     int midcb, sidcb;
 2388 
 2389                     minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
 2390                     mididx = av_clip(minidx, 0, SCALE_MAX_POS - SCALE_DIV_512);
 2391                     sididx = av_clip(minidx - sid_sf_boost * 3, 0, SCALE_MAX_POS - SCALE_DIV_512);
 2392                     if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT
 2393                         && (   !ff_sfdelta_can_replace(sce0, nextband0, prev_mid, mididx, w*16+g)
 2394                             || !ff_sfdelta_can_replace(sce1, nextband1, prev_side, sididx, w*16+g))) {
 2395                         /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
 2396                         continue;
 2397                     }
 2398 
 2399                     midcb = find_min_book(Mmax, mididx);
 2400                     sidcb = find_min_book(Smax, sididx);
 2401 
 2402                     /* No CB can be zero */
 2403                     midcb = FFMAX(1,midcb);
 2404                     sidcb = FFMAX(1,sidcb);
 2405 
 2406                     for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
 2407                         FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
 2408                         FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
 2409                         float minthr = FFMIN(band0->threshold, band1->threshold);
 2410                         int b1,b2,b3,b4;
 2411                         for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
 2412                             M[i] = (sce0->coeffs[start+(w+w2)*128+i]
 2413                                   + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
 2414                             S[i] =  M[i]
 2415                                   - sce1->coeffs[start+(w+w2)*128+i];
 2416                         }
 2417 
 2418                         abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
 2419                         abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
 2420                         abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
 2421                         abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
 2422                         dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
 2423                                                     L34,
 2424                                                     sce0->ics.swb_sizes[g],
 2425                                                     sce0->sf_idx[w*16+g],
 2426                                                     sce0->band_type[w*16+g],
 2427                                                     lambda / band0->threshold, INFINITY, &b1, NULL, 0);
 2428                         dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
 2429                                                     R34,
 2430                                                     sce1->ics.swb_sizes[g],
 2431                                                     sce1->sf_idx[w*16+g],
 2432                                                     sce1->band_type[w*16+g],
 2433                                                     lambda / band1->threshold, INFINITY, &b2, NULL, 0);
 2434                         dist2 += quantize_band_cost(s, M,
 2435                                                     M34,
 2436                                                     sce0->ics.swb_sizes[g],
 2437                                                     mididx,
 2438                                                     midcb,
 2439                                                     lambda / minthr, INFINITY, &b3, NULL, 0);
 2440                         dist2 += quantize_band_cost(s, S,
 2441                                                     S34,
 2442                                                     sce1->ics.swb_sizes[g],
 2443                                                     sididx,
 2444                                                     sidcb,
 2445                                                     mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
 2446                         B0 += b1+b2;
 2447                         B1 += b3+b4;
 2448                         dist1 -= b1+b2;
 2449                         dist2 -= b3+b4;
 2450                     }
 2451                     cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
 2452                     if (cpe->ms_mask[w*16+g]) {
 2453                         if (sce0->band_type[w*16+g] != NOISE_BT && sce1->band_type[w*16+g] != NOISE_BT) {
 2454                             sce0->sf_idx[w*16+g] = mididx;
 2455                             sce1->sf_idx[w*16+g] = sididx;
 2456                             sce0->band_type[w*16+g] = midcb;
 2457                             sce1->band_type[w*16+g] = sidcb;
 2458                         } else if ((sce0->band_type[w*16+g] != NOISE_BT) ^ (sce1->band_type[w*16+g] != NOISE_BT)) {
 2459                             /* ms_mask unneeded, and it confuses some decoders */
 2460                             cpe->ms_mask[w*16+g] = 0;
 2461                         }
 2462                         break;
 2463                     } else if (B1 > B0) {
 2464                         /* More boost won't fix this */
 2465                         break;
 2466                     }
 2467                 }
 2468             }
 2469             if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
 2470                 prev_mid = sce0->sf_idx[w*16+g];
 2471             if (!sce1->zeroes[w*16+g] && !cpe->is_mask[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
 2472                 prev_side = sce1->sf_idx[w*16+g];
 2473             start += sce0->ics.swb_sizes[g];
 2474         }
 2475     }
 2476 }
 2477 #endif /*HAVE_MIPSFPU */
 2478 
 2479 #include "libavcodec/aaccoder_trellis.h"
 2480 
 2481 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
 2482 #endif /* HAVE_INLINE_ASM */
 2483 
 2484 void ff_aac_coder_init_mips(AACEncContext *c) {
 2485 #if HAVE_INLINE_ASM
 2486 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
 2487     AACCoefficientsEncoder *e = c->coder;
 2488     int option = c->options.coder;
 2489 
 2490     if (option == 2) {
 2491         e->quantize_and_encode_band = quantize_and_encode_band_mips;
 2492         e->encode_window_bands_info = codebook_trellis_rate;
 2493 #if HAVE_MIPSFPU
 2494         e->search_for_quantizers    = search_for_quantizers_twoloop;
 2495 #endif /* HAVE_MIPSFPU */
 2496     }
 2497 #if HAVE_MIPSFPU
 2498     e->search_for_ms            = search_for_ms_mips;
 2499 #endif /* HAVE_MIPSFPU */
 2500 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
 2501 #endif /* HAVE_INLINE_ASM */
 2502 }