• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/arm/dsputil_iwmmxt_rnd_template.c

Go to the documentation of this file.
00001 /*
00002  * iWMMXt optimized DSP utils
00003  * copyright (c) 2004 AGAWA Koji
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
/**
 * Copy an 8-byte-wide block of h rows from pixels to block.
 *
 * The source may have any alignment: the pointer is rounded down to an
 * 8-byte boundary, 16 bytes are loaded per row, and walignr1 extracts the
 * 8 wanted bytes using the misalignment stored in wcgr1. Two rows are
 * processed per loop iteration (r4/r5 track the odd rows).
 *
 * @param block     destination, written with 64-bit wstrd stores
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; it is decremented by 2 per iteration and the
 *                  loop ends only when it reaches exactly 0, so it has to be
 *                  a positive even number
 */
00022 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00023 {
00024     int stride = line_size; /* writable copy: the asm doubles it */
00025     __asm__ volatile (
00026         "and r12, %[pixels], #7 \n\t" /* r12 = misalignment of pixels (0..7) */
00027         "bic %[pixels], %[pixels], #7 \n\t" /* round pixels down to 8 bytes */
00028         "tmcr wcgr1, r12 \n\t" /* wcgr1 = byte offset used by walignr1 */
00029         "add r4, %[pixels], %[line_size] \n\t" /* r4 = source row 1 */
00030         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00031         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00032         "1: \n\t"
00033         "wldrd wr0, [%[pixels]] \n\t"
00034         "subs %[h], %[h], #2 \n\t" /* sets the flags tested by bne below */
00035         "wldrd wr1, [%[pixels], #8] \n\t"
00036         "add %[pixels], %[pixels], %[line_size] \n\t"
00037         "wldrd wr3, [r4] \n\t"
00038         "pld [%[pixels]] \n\t"
00039         "pld [%[pixels], #32] \n\t"
00040         "wldrd wr4, [r4, #8] \n\t"
00041         "add r4, r4, %[line_size] \n\t"
00042         "walignr1 wr8, wr0, wr1 \n\t" /* wr8 = even source row, realigned */
00043         "pld [r4] \n\t"
00044         "pld [r4, #32] \n\t"
00045         "walignr1 wr10, wr3, wr4 \n\t" /* wr10 = odd source row, realigned */
00046         "wstrd wr8, [%[block]] \n\t"
00047         "add %[block], %[block], %[line_size] \n\t"
00048         "wstrd wr10, [r5] \n\t"
00049         "add r5, r5, %[line_size] \n\t"
00050         "bne 1b \n\t"
00051         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00052         :
00053         : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
00054 }
00055 
/**
 * Average an 8-byte-wide block of h source rows into block.
 *
 * Each realigned source row is combined with the bytes already stored in
 * the destination using WAVG2B and the result is written back. WAVG2B is a
 * mnemonic macro defined outside this block (presumably selecting the
 * rounding or non-rounding byte average -- confirm at its definition).
 * Two rows are processed per loop iteration.
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment; rounded down to 8 bytes and
 *                  realigned with walignr1
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00056 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00057 {
00058     int stride = line_size; /* writable copy: the asm doubles it */
00059     __asm__ volatile (
00060         "and r12, %[pixels], #7 \n\t" /* r12 = misalignment of pixels (0..7) */
00061         "bic %[pixels], %[pixels], #7 \n\t" /* round pixels down to 8 bytes */
00062         "tmcr wcgr1, r12 \n\t" /* wcgr1 = byte offset used by walignr1 */
00063         "add r4, %[pixels], %[line_size] \n\t" /* r4 = source row 1 */
00064         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00065         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00066         "1: \n\t"
00067         "wldrd wr0, [%[pixels]] \n\t"
00068         "subs %[h], %[h], #2 \n\t" /* sets the flags tested by bne below */
00069         "wldrd wr1, [%[pixels], #8] \n\t"
00070         "add %[pixels], %[pixels], %[line_size] \n\t"
00071         "wldrd wr3, [r4] \n\t"
00072         "pld [%[pixels]] \n\t"
00073         "pld [%[pixels], #32] \n\t"
00074         "wldrd wr4, [r4, #8] \n\t"
00075         "add r4, r4, %[line_size] \n\t"
00076         "walignr1 wr8, wr0, wr1 \n\t" /* wr8 = even source row, realigned */
00077         "wldrd wr0, [%[block]] \n\t" /* wr0 is free again: load even destination row */
00078         "wldrd wr2, [r5] \n\t" /* odd destination row */
00079         "pld [r4] \n\t"
00080         "pld [r4, #32] \n\t"
00081         "walignr1 wr10, wr3, wr4 \n\t" /* wr10 = odd source row, realigned */
00082         WAVG2B" wr8, wr8, wr0 \n\t" /* source averaged with destination */
00083         WAVG2B" wr10, wr10, wr2 \n\t"
00084         "wstrd wr8, [%[block]] \n\t"
00085         "add %[block], %[block], %[line_size] \n\t"
00086         "wstrd wr10, [r5] \n\t"
00087         "pld [%[block]] \n\t"
00088         "pld [%[block], #32] \n\t"
00089         "add r5, r5, %[line_size] \n\t"
00090         "pld [r5] \n\t"
00091         "pld [r5, #32] \n\t"
00092         "bne 1b \n\t"
00093         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00094         :
00095         : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
00096 }
00097 
/**
 * Copy a 16-byte-wide block of h rows from pixels to block.
 *
 * The source pointer is rounded down to an 8-byte boundary; 24 bytes are
 * loaded per row and two walignr1 operations extract the 16 wanted bytes
 * using the misalignment stored in wcgr1. Two rows are processed per loop
 * iteration (r4/r5 track the odd rows).
 *
 * @param block     destination, written with 64-bit wstrd stores
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00098 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00099 {
00100     int stride = line_size; /* writable copy: the asm doubles it */
00101     __asm__ volatile (
00102         "and r12, %[pixels], #7 \n\t" /* r12 = misalignment of pixels (0..7) */
00103         "bic %[pixels], %[pixels], #7 \n\t" /* round pixels down to 8 bytes */
00104         "tmcr wcgr1, r12 \n\t" /* wcgr1 = byte offset used by walignr1 */
00105         "add r4, %[pixels], %[line_size] \n\t" /* r4 = source row 1 */
00106         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00107         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00108         "1: \n\t"
00109         "wldrd wr0, [%[pixels]] \n\t"
00110         "wldrd wr1, [%[pixels], #8] \n\t"
00111         "subs %[h], %[h], #2 \n\t" /* sets the flags tested by bne below */
00112         "wldrd wr2, [%[pixels], #16] \n\t"
00113         "add %[pixels], %[pixels], %[line_size] \n\t"
00114         "wldrd wr3, [r4] \n\t"
00115         "pld [%[pixels]] \n\t"
00116         "pld [%[pixels], #32] \n\t"
00117         "walignr1 wr8, wr0, wr1 \n\t" /* even row, bytes 0..7 */
00118         "wldrd wr4, [r4, #8] \n\t"
00119         "walignr1 wr9, wr1, wr2 \n\t" /* even row, bytes 8..15 */
00120         "wldrd wr5, [r4, #16] \n\t"
00121         "add r4, r4, %[line_size] \n\t"
00122         "pld [r4] \n\t"
00123         "pld [r4, #32] \n\t"
00124         "walignr1 wr10, wr3, wr4 \n\t" /* odd row, bytes 0..7 */
00125         "wstrd wr8, [%[block]] \n\t"
00126         "walignr1 wr11, wr4, wr5 \n\t" /* odd row, bytes 8..15 */
00127         "wstrd wr9, [%[block], #8] \n\t"
00128         "add %[block], %[block], %[line_size] \n\t"
00129         "wstrd wr10, [r5] \n\t"
00130         "wstrd wr11, [r5, #8] \n\t"
00131         "add r5, r5, %[line_size] \n\t"
00132         "bne 1b \n\t"
00133         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00134         :
00135         : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
00136 }
00137 
/**
 * Average a 16-byte-wide block of h source rows into block.
 *
 * Each realigned source row is combined with the bytes already stored in
 * the destination using WAVG2B and the result is written back. WAVG2B is a
 * mnemonic macro defined outside this block (presumably selecting the
 * rounding or non-rounding byte average -- confirm at its definition).
 * Two rows are processed per loop iteration.
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment; rounded down to 8 bytes and
 *                  realigned with walignr1
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00138 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00139 {
00140     int stride = line_size; /* writable copy: the asm doubles it */
00141     __asm__ volatile (
00142         "pld [%[pixels]]                \n\t"
00143         "pld [%[pixels], #32]           \n\t"
00144         "pld [%[block]]                 \n\t"
00145         "pld [%[block], #32]            \n\t"
00146         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00147         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00148         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset used by walignr1 */
00149         "add r4, %[pixels], %[line_size]\n\t" /* r4 = source row 1 */
00150         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00151         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00152         "1:                             \n\t"
00153         "wldrd wr0, [%[pixels]]         \n\t"
00154         "wldrd wr1, [%[pixels], #8]     \n\t"
00155         "subs %[h], %[h], #2            \n\t" /* sets the flags tested by bne below */
00156         "wldrd wr2, [%[pixels], #16]    \n\t"
00157         "add %[pixels], %[pixels], %[line_size] \n\t"
00158         "wldrd wr3, [r4]                \n\t"
00159         "pld [%[pixels]]                \n\t"
00160         "pld [%[pixels], #32]           \n\t"
00161         "walignr1 wr8, wr0, wr1         \n\t" /* even row, bytes 0..7 */
00162         "wldrd wr4, [r4, #8]            \n\t"
00163         "walignr1 wr9, wr1, wr2         \n\t" /* even row, bytes 8..15 */
00164         "wldrd wr5, [r4, #16]           \n\t"
00165         "add r4, r4, %[line_size]       \n\t"
00166         "wldrd wr0, [%[block]]          \n\t" /* wr0..wr2 are free again: reuse for destination data */
00167         "pld [r4]                       \n\t"
00168         "wldrd wr1, [%[block], #8]      \n\t"
00169         "pld [r4, #32]                  \n\t"
00170         "wldrd wr2, [r5]                \n\t"
00171         "walignr1 wr10, wr3, wr4        \n\t" /* odd row, bytes 0..7 */
00172         "wldrd wr3, [r5, #8]            \n\t" /* wr3 consumed above, now holds destination data */
00173         WAVG2B" wr8, wr8, wr0           \n\t" /* source averaged with destination */
00174         WAVG2B" wr9, wr9, wr1           \n\t"
00175         WAVG2B" wr10, wr10, wr2         \n\t"
00176         "wstrd wr8, [%[block]]          \n\t"
00177         "walignr1 wr11, wr4, wr5        \n\t" /* odd row, bytes 8..15 */
00178         WAVG2B" wr11, wr11, wr3         \n\t"
00179         "wstrd wr9, [%[block], #8]      \n\t"
00180         "add %[block], %[block], %[line_size] \n\t"
00181         "wstrd wr10, [r5]               \n\t"
00182         "pld [%[block]]                 \n\t"
00183         "pld [%[block], #32]            \n\t"
00184         "wstrd wr11, [r5, #8]           \n\t"
00185         "add r5, r5, %[line_size]       \n\t"
00186         "pld [r5]                       \n\t"
00187         "pld [r5, #32]                  \n\t"
00188         "bne 1b \n\t"
00189         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
00190         :
00191         : "cc", "memory", "r4", "r5", "r12"); /* "cc": subs rewrites the condition flags */
00192 }
00193 
/**
 * Write an 8-byte-wide block where every output byte is the WAVG2B average
 * of a source byte and its right-hand neighbour (pixels[x] and pixels[x+1]).
 *
 * The source pointer is rounded down to 8 bytes. wcgr1 holds the
 * misalignment and wcgr2 the misalignment plus one, so walignr1 yields the
 * row at x and walignr2 the row at x+1. When misalignment + 1 equals 8 the
 * shifted row is exactly the second loaded doubleword and is copied with
 * wmoveq instead. Two rows are processed per loop iteration.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, written with 64-bit wstrd stores
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00194 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00195 {
00196     int stride = line_size; /* writable copy: the asm doubles it */
00197     // [wr0 wr2] even/odd source row at position x
00198     // [wr4 wr6] even/odd source row at position x+1
00199     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is not referenced by the asm below -- looks redundant here, confirm
00200     __asm__ volatile(
00201         "pld [%[pixels]]                \n\t"
00202         "pld [%[pixels], #32]           \n\t"
00203         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00204         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00205         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x */
00206         "add r12, r12, #1               \n\t"
00207         "add r4, %[pixels], %[line_size]\n\t" /* r4 = source row 1 */
00208         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1 */
00209         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00210         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00211 
00212         "1:                             \n\t"
00213         "wldrd wr10, [%[pixels]]        \n\t"
00214         "cmp r12, #8                    \n\t" /* EQ when the x+1 row starts exactly at the second doubleword */
00215         "wldrd wr11, [%[pixels], #8]    \n\t"
00216         "add %[pixels], %[pixels], %[line_size] \n\t"
00217         "wldrd wr13, [r4]               \n\t"
00218         "pld [%[pixels]]                \n\t"
00219         "wldrd wr14, [r4, #8]           \n\t"
00220         "pld [%[pixels], #32]           \n\t"
00221         "add r4, r4, %[line_size]       \n\t"
00222         "walignr1 wr0, wr10, wr11       \n\t"
00223         "pld [r4]                       \n\t"
00224         "pld [r4, #32]                  \n\t"
00225         "walignr1 wr2, wr13, wr14       \n\t"
00226         "wmoveq wr4, wr11               \n\t"
00227         "wmoveq wr6, wr14               \n\t"
00228         "walignr2ne wr4, wr10, wr11     \n\t"
00229         "walignr2ne wr6, wr13, wr14     \n\t"
00230         WAVG2B" wr0, wr0, wr4           \n\t" /* average of x and x+1 */
00231         WAVG2B" wr2, wr2, wr6           \n\t"
00232         "wstrd wr0, [%[block]]          \n\t"
00233         "subs %[h], %[h], #2            \n\t" /* must stay after the eq/ne instructions: it replaces the cmp flags */
00234         "wstrd wr2, [r5]                \n\t"
00235         "add %[block], %[block], %[line_size]   \n\t"
00236         "add r5, r5, %[line_size]       \n\t"
00237         "bne 1b                         \n\t"
00238         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00239         :
00240         : "cc", "r4", "r5", "r12", "memory"); /* "cc": cmp and subs rewrite the condition flags */
00241 }
00242 
/**
 * Write a 16-byte-wide block where every output byte is the WAVG2B average
 * of a source byte and its right-hand neighbour (pixels[x] and pixels[x+1]).
 *
 * The source pointer is rounded down to 8 bytes and 24 bytes are loaded per
 * row. wcgr1 holds the misalignment and wcgr2 the misalignment plus one, so
 * walignr1 yields the row at x and walignr2 the row at x+1. When
 * misalignment + 1 equals 8 the shifted data is exactly the following
 * loaded doublewords and is copied with wmoveq instead. Two rows are
 * processed per loop iteration.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, written with 64-bit wstrd stores
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00243 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00244 {
00245     int stride = line_size; /* writable copy: the asm doubles it */
00246     // [wr0 wr1 | wr2 wr3] even | odd source row at position x
00247     // [wr4 wr5 | wr6 wr7] even | odd source row at position x+1
00248     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is overwritten by the wldrd from [r4, #16] below -- looks redundant here, confirm
00249     __asm__ volatile(
00250         "pld [%[pixels]]                \n\t"
00251         "pld [%[pixels], #32]           \n\t"
00252         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00253         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00254         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x */
00255         "add r12, r12, #1               \n\t"
00256         "add r4, %[pixels], %[line_size]\n\t" /* r4 = source row 1 */
00257         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1 */
00258         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00259         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00260 
00261         "1:                             \n\t"
00262         "wldrd wr10, [%[pixels]]        \n\t"
00263         "cmp r12, #8                    \n\t" /* EQ when the x+1 row starts exactly at the second doubleword */
00264         "wldrd wr11, [%[pixels], #8]    \n\t"
00265         "wldrd wr12, [%[pixels], #16]   \n\t"
00266         "add %[pixels], %[pixels], %[line_size] \n\t"
00267         "wldrd wr13, [r4]               \n\t"
00268         "pld [%[pixels]]                \n\t"
00269         "wldrd wr14, [r4, #8]           \n\t"
00270         "pld [%[pixels], #32]           \n\t"
00271         "wldrd wr15, [r4, #16]          \n\t"
00272         "add r4, r4, %[line_size]       \n\t"
00273         "walignr1 wr0, wr10, wr11       \n\t"
00274         "pld [r4]                       \n\t"
00275         "pld [r4, #32]                  \n\t"
00276         "walignr1 wr1, wr11, wr12       \n\t"
00277         "walignr1 wr2, wr13, wr14       \n\t"
00278         "walignr1 wr3, wr14, wr15       \n\t"
00279         "wmoveq wr4, wr11               \n\t"
00280         "wmoveq wr5, wr12               \n\t"
00281         "wmoveq wr6, wr14               \n\t"
00282         "wmoveq wr7, wr15               \n\t"
00283         "walignr2ne wr4, wr10, wr11     \n\t"
00284         "walignr2ne wr5, wr11, wr12     \n\t"
00285         "walignr2ne wr6, wr13, wr14     \n\t"
00286         "walignr2ne wr7, wr14, wr15     \n\t"
00287         WAVG2B" wr0, wr0, wr4           \n\t" /* average of x and x+1 */
00288         WAVG2B" wr1, wr1, wr5           \n\t"
00289         "wstrd wr0, [%[block]]          \n\t"
00290         WAVG2B" wr2, wr2, wr6           \n\t"
00291         "wstrd wr1, [%[block], #8]      \n\t"
00292         WAVG2B" wr3, wr3, wr7           \n\t"
00293         "add %[block], %[block], %[line_size]   \n\t"
00294         "wstrd wr2, [r5]                \n\t"
00295         "subs %[h], %[h], #2            \n\t" /* must stay after the eq/ne instructions: it replaces the cmp flags */
00296         "wstrd wr3, [r5, #8]            \n\t"
00297         "add r5, r5, %[line_size]       \n\t"
00298         "bne 1b                         \n\t"
00299         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00300         :
00301         : "cc", "r4", "r5", "r12", "memory"); /* "cc": cmp and subs rewrite the condition flags */
00302 }
00303 
/**
 * Horizontally interpolate an 8-byte-wide block and average it into block.
 *
 * For every byte the WAVG2B average of pixels[x] and pixels[x+1] is formed
 * and then averaged (again with WAVG2B) with the byte already stored in the
 * destination. wcgr1 holds the source misalignment and wcgr2 the
 * misalignment plus one; when the latter equals 8 the shifted row is
 * exactly the second loaded doubleword and is copied with wmoveq instead
 * of being extracted with walignr2. Two rows are processed per iteration.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00304 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00305 {
00306     int stride = line_size; /* writable copy: the asm doubles it */
00307     // [wr0 wr2] even/odd source row at position x
00308     // [wr4 wr6] even/odd source row at position x+1
00309     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is not referenced by the asm below -- looks redundant here, confirm
00310     __asm__ volatile(
00311         "pld [%[pixels]]                \n\t"
00312         "pld [%[pixels], #32]           \n\t"
00313         "pld [%[block]]                 \n\t"
00314         "pld [%[block], #32]            \n\t"
00315         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00316         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00317         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x */
00318         "add r12, r12, #1               \n\t"
00319         "add r4, %[pixels], %[line_size]\n\t" /* r4 = source row 1 */
00320         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1 */
00321         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00322         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00323         "pld [r5]                       \n\t"
00324         "pld [r5, #32]                  \n\t"
00325 
00326         "1:                             \n\t"
00327         "wldrd wr10, [%[pixels]]        \n\t"
00328         "cmp r12, #8                    \n\t" /* EQ when the x+1 row starts exactly at the second doubleword */
00329         "wldrd wr11, [%[pixels], #8]    \n\t"
00330         "add %[pixels], %[pixels], %[line_size] \n\t"
00331         "wldrd wr13, [r4]               \n\t"
00332         "pld [%[pixels]]                \n\t"
00333         "wldrd wr14, [r4, #8]           \n\t"
00334         "pld [%[pixels], #32]           \n\t"
00335         "add r4, r4, %[line_size]       \n\t"
00336         "walignr1 wr0, wr10, wr11       \n\t"
00337         "pld [r4]                       \n\t"
00338         "pld [r4, #32]                  \n\t"
00339         "walignr1 wr2, wr13, wr14       \n\t"
00340         "wmoveq wr4, wr11               \n\t"
00341         "wmoveq wr6, wr14               \n\t"
00342         "walignr2ne wr4, wr10, wr11     \n\t"
00343         "wldrd wr10, [%[block]]         \n\t" /* wr10 is free again: even destination row */
00344         "walignr2ne wr6, wr13, wr14     \n\t"
00345         "wldrd wr12, [r5]               \n\t" /* odd destination row */
00346         WAVG2B" wr0, wr0, wr4           \n\t" /* average of x and x+1 */
00347         WAVG2B" wr2, wr2, wr6           \n\t"
00348         WAVG2B" wr0, wr0, wr10          \n\t" /* then averaged with the destination */
00349         WAVG2B" wr2, wr2, wr12          \n\t"
00350         "wstrd wr0, [%[block]]          \n\t"
00351         "subs %[h], %[h], #2            \n\t" /* must stay after the eq/ne instructions: it replaces the cmp flags */
00352         "wstrd wr2, [r5]                \n\t"
00353         "add %[block], %[block], %[line_size]   \n\t"
00354         "add r5, r5, %[line_size]       \n\t"
00355         "pld [%[block]]                 \n\t"
00356         "pld [%[block], #32]            \n\t"
00357         "pld [r5]                       \n\t"
00358         "pld [r5, #32]                  \n\t"
00359         "bne 1b                         \n\t"
00360         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00361         :
00362         : "cc", "r4", "r5", "r12", "memory"); /* "cc": cmp and subs rewrite the condition flags */
00363 }
00364 
/**
 * Horizontally interpolate a 16-byte-wide block and average it into block.
 *
 * For every byte the WAVG2B average of pixels[x] and pixels[x+1] is formed
 * and then averaged (again with WAVG2B) with the byte already stored in the
 * destination. 24 source bytes are loaded per row from the 8-byte-aligned
 * address; wcgr1 holds the misalignment and wcgr2 the misalignment plus
 * one. When the latter equals 8 the shifted data is exactly the following
 * loaded doublewords and is copied with wmoveq instead of being extracted
 * with walignr2. Two rows are processed per iteration.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00365 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00366 {
00367     int stride = line_size; /* writable copy: the asm doubles it */
00368     // [wr0 wr1 | wr2 wr3] even | odd source row at position x
00369     // [wr4 wr5 | wr6 wr7] even | odd source row at position x+1
00370     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; NOTE(review): wr15 is overwritten by the wldrd from [r4, #16] below -- looks redundant here, confirm
00371     __asm__ volatile(
00372         "pld [%[pixels]]                \n\t"
00373         "pld [%[pixels], #32]           \n\t"
00374         "pld [%[block]]                 \n\t"
00375         "pld [%[block], #32]            \n\t"
00376         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00377         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00378         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset of x */
00379         "add r12, r12, #1               \n\t"
00380         "add r4, %[pixels], %[line_size]\n\t" /* r4 = source row 1 */
00381         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset of x+1 */
00382         "add r5, %[block], %[line_size] \n\t" /* r5 = destination row 1 */
00383         "mov %[line_size], %[line_size], lsl #1 \n\t" /* advance two rows per iteration */
00384         "pld [r5]                       \n\t"
00385         "pld [r5, #32]                  \n\t"
00386 
00387         "1:                             \n\t"
00388         "wldrd wr10, [%[pixels]]        \n\t"
00389         "cmp r12, #8                    \n\t" /* EQ when the x+1 row starts exactly at the second doubleword */
00390         "wldrd wr11, [%[pixels], #8]    \n\t"
00391         "wldrd wr12, [%[pixels], #16]   \n\t"
00392         "add %[pixels], %[pixels], %[line_size] \n\t"
00393         "wldrd wr13, [r4]               \n\t"
00394         "pld [%[pixels]]                \n\t"
00395         "wldrd wr14, [r4, #8]           \n\t"
00396         "pld [%[pixels], #32]           \n\t"
00397         "wldrd wr15, [r4, #16]          \n\t"
00398         "add r4, r4, %[line_size]       \n\t"
00399         "walignr1 wr0, wr10, wr11       \n\t"
00400         "pld [r4]                       \n\t"
00401         "pld [r4, #32]                  \n\t"
00402         "walignr1 wr1, wr11, wr12       \n\t"
00403         "walignr1 wr2, wr13, wr14       \n\t"
00404         "walignr1 wr3, wr14, wr15       \n\t"
00405         "wmoveq wr4, wr11               \n\t"
00406         "wmoveq wr5, wr12               \n\t"
00407         "wmoveq wr6, wr14               \n\t"
00408         "wmoveq wr7, wr15               \n\t"
00409         "walignr2ne wr4, wr10, wr11     \n\t"
00410         "walignr2ne wr5, wr11, wr12     \n\t"
00411         "walignr2ne wr6, wr13, wr14     \n\t"
00412         "walignr2ne wr7, wr14, wr15     \n\t"
00413         "wldrd wr10, [%[block]]         \n\t" /* wr10..wr13 are free again: reuse for destination data */
00414         WAVG2B" wr0, wr0, wr4           \n\t" /* average of x and x+1 */
00415         "wldrd wr11, [%[block], #8]     \n\t"
00416         WAVG2B" wr1, wr1, wr5           \n\t"
00417         "wldrd wr12, [r5]               \n\t"
00418         WAVG2B" wr2, wr2, wr6           \n\t"
00419         "wldrd wr13, [r5, #8]           \n\t"
00420         WAVG2B" wr3, wr3, wr7           \n\t"
00421         WAVG2B" wr0, wr0, wr10          \n\t" /* then averaged with the destination */
00422         WAVG2B" wr1, wr1, wr11          \n\t"
00423         WAVG2B" wr2, wr2, wr12          \n\t"
00424         WAVG2B" wr3, wr3, wr13          \n\t"
00425         "wstrd wr0, [%[block]]          \n\t"
00426         "subs %[h], %[h], #2            \n\t" /* must stay after the eq/ne instructions: it replaces the cmp flags */
00427         "wstrd wr1, [%[block], #8]      \n\t"
00428         "add %[block], %[block], %[line_size]   \n\t"
00429         "wstrd wr2, [r5]                \n\t"
00430         "pld [%[block]]                 \n\t"
00431         "wstrd wr3, [r5, #8]            \n\t"
00432         "add r5, r5, %[line_size]       \n\t"
00433         "pld [%[block], #32]            \n\t"
00434         "pld [r5]                       \n\t"
00435         "pld [r5, #32]                  \n\t"
00436         "bne 1b                         \n\t"
00437         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00438         :
00439         : "cc", "r4", "r5", "r12", "memory"); /* "cc": cmp and subs rewrite the condition flags */
00440 }
00441 
/**
 * Vertically interpolate an 8-byte-wide block and average it into block.
 *
 * Each output row is the WAVG2B average of two consecutive source rows,
 * averaged again (WAVG2B) with the bytes already stored in the destination.
 * One source row is loaded before the loop and two more per iteration, so
 * h + 1 source rows are read in total. wr0 and wr4 alternate as the
 * "previous" and "current" row so no register copy is needed.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment; rounded down to 8 bytes and
 *                  realigned with walignr1 using the offset in wcgr1
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00442 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00443 {
00444     int stride = line_size; /* local copy bound to the read-write [line_size] operand */
00445     // wr0: source row held from the previous step (reloaded in the second half of the loop)
00446     // wr4: source row loaded in the first half of the loop
00447     __asm__ volatile(
00448         "pld            [%[pixels]]                             \n\t"
00449         "pld            [%[pixels], #32]                        \n\t"
00450         "and            r12, %[pixels], #7                      \n\t" /* r12 = misalignment of pixels (0..7) */
00451         "tmcr           wcgr1, r12                              \n\t" /* wcgr1 = byte offset used by walignr1 */
00452         "bic            %[pixels], %[pixels], #7                \n\t" /* round pixels down to 8 bytes */
00453 
00454         "wldrd          wr10, [%[pixels]]                       \n\t" /* prime wr0 with source row 0 */
00455         "wldrd          wr11, [%[pixels], #8]                   \n\t"
00456         "pld            [%[block]]                              \n\t"
00457         "add            %[pixels], %[pixels], %[line_size]      \n\t"
00458         "walignr1       wr0, wr10, wr11                         \n\t"
00459         "pld            [%[pixels]]                             \n\t"
00460         "pld            [%[pixels], #32]                        \n\t"
00461 
00462       "1:                                                       \n\t"
00463         "wldrd          wr10, [%[pixels]]                       \n\t" /* first half: next source row into wr4 */
00464         "wldrd          wr11, [%[pixels], #8]                   \n\t"
00465         "add            %[pixels], %[pixels], %[line_size]      \n\t"
00466         "pld            [%[pixels]]                             \n\t"
00467         "pld            [%[pixels], #32]                        \n\t"
00468         "walignr1       wr4, wr10, wr11                         \n\t"
00469         "wldrd          wr10, [%[block]]                        \n\t" /* existing destination row */
00470          WAVG2B"        wr8, wr0, wr4                           \n\t" /* average of the two source rows */
00471          WAVG2B"        wr8, wr8, wr10                          \n\t" /* then averaged with the destination */
00472         "wstrd          wr8, [%[block]]                         \n\t"
00473         "add            %[block], %[block], %[line_size]        \n\t"
00474 
00475         "wldrd          wr10, [%[pixels]]                       \n\t" /* second half: next source row into wr0 */
00476         "wldrd          wr11, [%[pixels], #8]                   \n\t"
00477         "pld            [%[block]]                              \n\t"
00478         "add            %[pixels], %[pixels], %[line_size]      \n\t"
00479         "pld            [%[pixels]]                             \n\t"
00480         "pld            [%[pixels], #32]                        \n\t"
00481         "walignr1       wr0, wr10, wr11                         \n\t"
00482         "wldrd          wr10, [%[block]]                        \n\t"
00483          WAVG2B"        wr8, wr0, wr4                           \n\t"
00484          WAVG2B"        wr8, wr8, wr10                          \n\t"
00485         "wstrd          wr8, [%[block]]                         \n\t"
00486         "add            %[block], %[block], %[line_size]        \n\t"
00487 
00488         "subs           %[h], %[h], #2                          \n\t" /* two rows done; sets the flags for bne */
00489         "pld            [%[block]]                              \n\t"
00490         "bne            1b                                      \n\t"
00491         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00492         :
00493         : "cc", "memory", "r12");
00494 }
00495 
/**
 * Write a 16-byte-wide block where every output row is the WAVG2B average
 * of two consecutive source rows (vertical interpolation).
 *
 * One source row is loaded before the loop and two more per iteration, so
 * h + 1 source rows are read in total. [wr0 wr1] and [wr4 wr5] alternate
 * as the "previous" and "current" row so no register copy is needed.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, written with 64-bit wstrd stores
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment; rounded down to 8 bytes and
 *                  realigned with walignr1 using the offset in wcgr1
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00496 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00497 {
00498     int stride = line_size; /* local copy bound to the read-write [line_size] operand */
00499     // [wr0 wr1] source row held from the previous step
00500     // [wr4 wr5] source row loaded in the first half of the loop
00501     __asm__ volatile(
00502         "pld [%[pixels]]                \n\t"
00503         "pld [%[pixels], #32]           \n\t"
00504         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00505         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset used by walignr1 */
00506         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00507 
00508         "wldrd wr10, [%[pixels]]        \n\t" /* prime [wr0 wr1] with source row 0 */
00509         "wldrd wr11, [%[pixels], #8]    \n\t"
00510         "wldrd wr12, [%[pixels], #16]   \n\t"
00511         "add %[pixels], %[pixels], %[line_size] \n\t"
00512         "pld [%[pixels]]                \n\t"
00513         "pld [%[pixels], #32]           \n\t"
00514         "walignr1 wr0, wr10, wr11       \n\t"
00515         "walignr1 wr1, wr11, wr12       \n\t"
00516 
00517         "1:                             \n\t"
00518         "wldrd wr10, [%[pixels]]        \n\t" /* first half: next source row into [wr4 wr5] */
00519         "wldrd wr11, [%[pixels], #8]    \n\t"
00520         "wldrd wr12, [%[pixels], #16]   \n\t"
00521         "add %[pixels], %[pixels], %[line_size] \n\t"
00522         "pld [%[pixels]]                \n\t"
00523         "pld [%[pixels], #32]           \n\t"
00524         "walignr1 wr4, wr10, wr11       \n\t"
00525         "walignr1 wr5, wr11, wr12       \n\t"
00526         WAVG2B" wr8, wr0, wr4           \n\t" /* average of the two source rows */
00527         WAVG2B" wr9, wr1, wr5           \n\t"
00528         "wstrd wr8, [%[block]]          \n\t"
00529         "wstrd wr9, [%[block], #8]      \n\t"
00530         "add %[block], %[block], %[line_size]   \n\t"
00531 
00532         "wldrd wr10, [%[pixels]]        \n\t" /* second half: next source row into [wr0 wr1] */
00533         "wldrd wr11, [%[pixels], #8]    \n\t"
00534         "wldrd wr12, [%[pixels], #16]   \n\t"
00535         "add %[pixels], %[pixels], %[line_size] \n\t"
00536         "pld [%[pixels]]                \n\t"
00537         "pld [%[pixels], #32]           \n\t"
00538         "walignr1 wr0, wr10, wr11       \n\t"
00539         "walignr1 wr1, wr11, wr12       \n\t"
00540         WAVG2B" wr8, wr0, wr4           \n\t"
00541         WAVG2B" wr9, wr1, wr5           \n\t"
00542         "wstrd wr8, [%[block]]          \n\t"
00543         "wstrd wr9, [%[block], #8]      \n\t"
00544         "add %[block], %[block], %[line_size]   \n\t"
00545 
00546         "subs %[h], %[h], #2            \n\t" /* two rows done; sets the flags for bne */
00547         "bne 1b                         \n\t"
00548         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00549         :
00550         : "cc", "memory", "r12"); /* subs rewrites the flags; r4/r5 are not used by this asm */
00551 }
00552 
/**
 * Vertically interpolate a 16-byte-wide block and average it into block.
 *
 * Each output row is the WAVG2B average of two consecutive source rows,
 * averaged again (WAVG2B) with the bytes already stored in the destination.
 * One source row is loaded before the loop and two more per iteration, so
 * h + 1 source rows are read in total. [wr0 wr1] and [wr4 wr5] alternate
 * as the "previous" and "current" row so no register copy is needed.
 * WAVG2B is a mnemonic macro defined outside this block (presumably
 * selecting the rounding or non-rounding average -- confirm).
 *
 * @param block     destination, read and written with 64-bit accesses
 *                  (presumably must be 8-byte aligned -- confirm with callers)
 * @param pixels    source, any alignment; rounded down to 8 bytes and
 *                  realigned with walignr1 using the offset in wcgr1
 * @param line_size byte distance between consecutive rows, used for both
 *                  the source and the destination
 * @param h         row count; decremented by 2 per iteration and the loop
 *                  ends only at exactly 0, so it has to be positive and even
 */
00553 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00554 {
00555     int stride = line_size; /* local copy bound to the read-write [line_size] operand */
00556     // [wr0 wr1] source row held from the previous step
00557     // [wr4 wr5] source row loaded in the first half of the loop
00558     __asm__ volatile(
00559         "pld [%[pixels]]                \n\t"
00560         "pld [%[pixels], #32]           \n\t"
00561         "and r12, %[pixels], #7         \n\t" /* r12 = misalignment of pixels (0..7) */
00562         "tmcr wcgr1, r12                \n\t" /* wcgr1 = byte offset used by walignr1 */
00563         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to 8 bytes */
00564 
00565         "wldrd wr10, [%[pixels]]        \n\t" /* prime [wr0 wr1] with source row 0 */
00566         "wldrd wr11, [%[pixels], #8]    \n\t"
00567         "pld [%[block]]                 \n\t"
00568         "wldrd wr12, [%[pixels], #16]   \n\t"
00569         "add %[pixels], %[pixels], %[line_size] \n\t"
00570         "pld [%[pixels]]                \n\t"
00571         "pld [%[pixels], #32]           \n\t"
00572         "walignr1 wr0, wr10, wr11       \n\t"
00573         "walignr1 wr1, wr11, wr12       \n\t"
00574 
00575         "1:                             \n\t"
00576         "wldrd wr10, [%[pixels]]        \n\t" /* first half: next source row into [wr4 wr5] */
00577         "wldrd wr11, [%[pixels], #8]    \n\t"
00578         "wldrd wr12, [%[pixels], #16]   \n\t"
00579         "add %[pixels], %[pixels], %[line_size] \n\t"
00580         "pld [%[pixels]]                \n\t"
00581         "pld [%[pixels], #32]           \n\t"
00582         "walignr1 wr4, wr10, wr11       \n\t"
00583         "walignr1 wr5, wr11, wr12       \n\t"
00584         "wldrd wr10, [%[block]]         \n\t" /* wr10/wr11 are free again: existing destination row */
00585         "wldrd wr11, [%[block], #8]     \n\t"
00586         WAVG2B" wr8, wr0, wr4           \n\t" /* average of the two source rows */
00587         WAVG2B" wr9, wr1, wr5           \n\t"
00588         WAVG2B" wr8, wr8, wr10          \n\t" /* then averaged with the destination */
00589         WAVG2B" wr9, wr9, wr11          \n\t"
00590         "wstrd wr8, [%[block]]          \n\t"
00591         "wstrd wr9, [%[block], #8]      \n\t"
00592         "add %[block], %[block], %[line_size]   \n\t"
00593 
00594         "wldrd wr10, [%[pixels]]        \n\t" /* second half: next source row into [wr0 wr1] */
00595         "wldrd wr11, [%[pixels], #8]    \n\t"
00596         "pld [%[block]]                 \n\t"
00597         "wldrd wr12, [%[pixels], #16]   \n\t"
00598         "add %[pixels], %[pixels], %[line_size] \n\t"
00599         "pld [%[pixels]]                \n\t"
00600         "pld [%[pixels], #32]           \n\t"
00601         "walignr1 wr0, wr10, wr11       \n\t"
00602         "walignr1 wr1, wr11, wr12       \n\t"
00603         "wldrd wr10, [%[block]]         \n\t"
00604         "wldrd wr11, [%[block], #8]     \n\t"
00605         WAVG2B" wr8, wr0, wr4           \n\t"
00606         WAVG2B" wr9, wr1, wr5           \n\t"
00607         WAVG2B" wr8, wr8, wr10          \n\t"
00608         WAVG2B" wr9, wr9, wr11          \n\t"
00609         "wstrd wr8, [%[block]]          \n\t"
00610         "wstrd wr9, [%[block], #8]      \n\t"
00611         "add %[block], %[block], %[line_size]   \n\t"
00612 
00613         "subs %[h], %[h], #2            \n\t" /* two rows done; sets the flags for bne */
00614         "pld [%[block]]                 \n\t"
00615         "bne 1b                         \n\t"
00616         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
00617         :
00618         : "cc", "memory", "r12"); /* subs rewrites the flags; r4/r5 are not used by this asm */
00619 }
00620 
00621 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00622 {
00623     // Stores 8 bytes per row to block: (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2 for consecutive source rows p and q.
00624     // [wr0 wr1 wr2 wr3] for previous line, [wr4 wr5 wr6 wr7] for current line (this 8-wide variant only uses wr0-wr2 / wr4-wr6).
00625     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; added to the 4-sample sum before the >> 2
00626     __asm__ volatile(
00627         "pld [%[pixels]]                \n\t"
00628         "mov r12, #2                    \n\t"
00629         "pld [%[pixels], #32]           \n\t"
00630         "tmcr wcgr0, r12                \n\t" /* for shift value */
00631         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
00632         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to an 8-byte boundary */
00633         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, consumed by walignr1 */
00634 
00635         // [wr0 wr1 wr2 wr3] <= *   (prime the previous-line sums from the first source row)
00636         // [wr4 wr5 wr6 wr7]
00637         "wldrd wr12, [%[pixels]]        \n\t"
00638         "add r12, r12, #1               \n\t" /* offset of the right-hand neighbour p[x+1] */
00639         "wldrd wr13, [%[pixels], #8]    \n\t"
00640         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, consumed by walignr2 */
00641         "add %[pixels], %[pixels], %[line_size] \n\t"
00642         "cmp r12, #8                    \n\t" /* eq: p[x+1] starts exactly at wr13 */
00643         "pld [%[pixels]]                \n\t"
00644         "pld [%[pixels], #32]           \n\t"
00645         "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = p[0..7] */
00646         "wmoveq wr10, wr13              \n\t" /* offset + 1 == 8: presumably not expressible as a walignr amount, so copy */
00647         "walignr2ne wr10, wr12, wr13    \n\t" /* wr10 = p[1..8] */
00648         "wunpckelub wr0, wr2            \n\t" /* widen bytes to unsigned 16-bit lanes */
00649         "wunpckehub wr1, wr2            \n\t"
00650         "wunpckelub wr8, wr10           \n\t"
00651         "wunpckehub wr9, wr10           \n\t"
00652         "waddhus wr0, wr0, wr8          \n\t" /* wr0/wr1 = p[x] + p[x+1] */
00653         "waddhus wr1, wr1, wr9          \n\t"
00654 
00655         "1:                             \n\t"
00656         // [wr0 wr1 wr2 wr3]
00657         // [wr4 wr5 wr6 wr7] <= *   (horizontal sums of the next source row)
00658         "wldrd wr12, [%[pixels]]        \n\t"
00659         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
00660         "wldrd wr13, [%[pixels], #8]    \n\t"
00661         "add %[pixels], %[pixels], %[line_size] \n\t"
00662         "walignr1 wr6, wr12, wr13       \n\t"
00663         "pld [%[pixels]]                \n\t"
00664         "pld [%[pixels], #32]           \n\t"
00665         "wmoveq wr10, wr13              \n\t"
00666         "walignr2ne wr10, wr12, wr13    \n\t"
00667         "wunpckelub wr4, wr6            \n\t"
00668         "wunpckehub wr5, wr6            \n\t"
00669         "wunpckelub wr8, wr10           \n\t"
00670         "wunpckehub wr9, wr10           \n\t"
00671         "waddhus wr4, wr4, wr8          \n\t"
00672         "waddhus wr5, wr5, wr9          \n\t"
00673         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
00674         "waddhus wr9, wr1, wr5          \n\t"
00675         "waddhus wr8, wr8, wr15         \n\t" /* + rounding term */
00676         "waddhus wr9, wr9, wr15         \n\t"
00677         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
00678         "wsrlhg wr9, wr9, wcgr0         \n\t"
00679         "wpackhus wr8, wr8, wr9         \n\t" /* back to 8 unsigned bytes */
00680         "wstrd wr8, [%[block]]          \n\t"
00681         "add %[block], %[block], %[line_size]   \n\t"
00682 
00683         // [wr0 wr1 wr2 wr3] <= *   (roles swap; flags from the cmp at label 1 are still live here)
00684         // [wr4 wr5 wr6 wr7]
00685         "wldrd wr12, [%[pixels]]        \n\t"
00686         "wldrd wr13, [%[pixels], #8]    \n\t"
00687         "add %[pixels], %[pixels], %[line_size] \n\t"
00688         "walignr1 wr2, wr12, wr13       \n\t"
00689         "pld [%[pixels]]                \n\t"
00690         "pld [%[pixels], #32]           \n\t"
00691         "wmoveq wr10, wr13              \n\t"
00692         "walignr2ne wr10, wr12, wr13    \n\t"
00693         "wunpckelub wr0, wr2            \n\t"
00694         "wunpckehub wr1, wr2            \n\t"
00695         "wunpckelub wr8, wr10           \n\t"
00696         "wunpckehub wr9, wr10           \n\t"
00697         "waddhus wr0, wr0, wr8          \n\t"
00698         "waddhus wr1, wr1, wr9          \n\t"
00699         "waddhus wr8, wr0, wr4          \n\t"
00700         "waddhus wr9, wr1, wr5          \n\t"
00701         "waddhus wr8, wr8, wr15         \n\t"
00702         "waddhus wr9, wr9, wr15         \n\t"
00703         "wsrlhg wr8, wr8, wcgr0         \n\t"
00704         "wsrlhg wr9, wr9, wcgr0         \n\t"
00705         "wpackhus wr8, wr8, wr9         \n\t"
00706         "subs %[h], %[h], #2            \n\t" /* two rows per pass; the loop ends only when h hits exactly 0 */
00707         "wstrd wr8, [%[block]]          \n\t"
00708         "add %[block], %[block], %[line_size]   \n\t"
00709         "bne 1b                         \n\t"
00710         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00711         : [line_size]"r"(line_size)
00712         : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
00713 }
00714 
00715 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00716 {
00717     // Stores 16 bytes per row to block: (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2 for consecutive source rows p and q.
00718     // [wr0 wr1 wr2 wr3] for previous line, [wr4 wr5 wr6 wr7] for current line (four 16-bit lanes per register).
00719     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; added to the 4-sample sum before the >> 2
00720     __asm__ volatile(
00721         "pld [%[pixels]]                \n\t"
00722         "mov r12, #2                    \n\t"
00723         "pld [%[pixels], #32]           \n\t"
00724         "tmcr wcgr0, r12                \n\t" /* for shift value */
00725         /* alignment */
00726         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
00727         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to an 8-byte boundary */
00728         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, consumed by walignr1 */
00729         "add r12, r12, #1               \n\t"
00730         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, consumed by walignr2 */
00731 
00732         // [wr0 wr1 wr2 wr3] <= *   (prime the previous-line sums from the first source row)
00733         // [wr4 wr5 wr6 wr7]
00734         "wldrd wr12, [%[pixels]]        \n\t"
00735         "cmp r12, #8                    \n\t" /* eq: p[x+1] starts exactly at the following doubleword */
00736         "wldrd wr13, [%[pixels], #8]    \n\t"
00737         "wldrd wr14, [%[pixels], #16]   \n\t" /* third doubleword covers the unaligned tail */
00738         "add %[pixels], %[pixels], %[line_size] \n\t"
00739         "pld [%[pixels]]                \n\t"
00740         "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = p[0..7] */
00741         "pld [%[pixels], #32]           \n\t"
00742         "walignr1 wr3, wr13, wr14       \n\t" /* wr3 = p[8..15] */
00743         "wmoveq wr10, wr13              \n\t" /* offset + 1 == 8: presumably not expressible as a walignr amount, so copy */
00744         "wmoveq wr11, wr14              \n\t"
00745         "walignr2ne wr10, wr12, wr13    \n\t" /* wr10 = p[1..8] */
00746         "walignr2ne wr11, wr13, wr14    \n\t" /* wr11 = p[9..16] */
00747         "wunpckelub wr0, wr2            \n\t" /* widen bytes to unsigned 16-bit lanes */
00748         "wunpckehub wr1, wr2            \n\t"
00749         "wunpckelub wr2, wr3            \n\t" /* wr3 is read here before being overwritten on the next line */
00750         "wunpckehub wr3, wr3            \n\t"
00751         "wunpckelub wr8, wr10           \n\t"
00752         "wunpckehub wr9, wr10           \n\t"
00753         "wunpckelub wr10, wr11          \n\t"
00754         "wunpckehub wr11, wr11          \n\t"
00755         "waddhus wr0, wr0, wr8          \n\t" /* wr0..wr3 = p[x] + p[x+1] */
00756         "waddhus wr1, wr1, wr9          \n\t"
00757         "waddhus wr2, wr2, wr10         \n\t"
00758         "waddhus wr3, wr3, wr11         \n\t"
00759 
00760         "1:                             \n\t"
00761         // [wr0 wr1 wr2 wr3]
00762         // [wr4 wr5 wr6 wr7] <= *   (horizontal sums of the next source row)
00763         "wldrd wr12, [%[pixels]]        \n\t"
00764         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
00765         "wldrd wr13, [%[pixels], #8]    \n\t"
00766         "wldrd wr14, [%[pixels], #16]   \n\t"
00767         "add %[pixels], %[pixels], %[line_size] \n\t"
00768         "walignr1 wr6, wr12, wr13       \n\t"
00769         "pld [%[pixels]]                \n\t"
00770         "pld [%[pixels], #32]           \n\t"
00771         "walignr1 wr7, wr13, wr14       \n\t"
00772         "wmoveq wr10, wr13              \n\t"
00773         "wmoveq wr11, wr14              \n\t"
00774         "walignr2ne wr10, wr12, wr13    \n\t"
00775         "walignr2ne wr11, wr13, wr14    \n\t"
00776         "wunpckelub wr4, wr6            \n\t"
00777         "wunpckehub wr5, wr6            \n\t"
00778         "wunpckelub wr6, wr7            \n\t"
00779         "wunpckehub wr7, wr7            \n\t"
00780         "wunpckelub wr8, wr10           \n\t"
00781         "wunpckehub wr9, wr10           \n\t"
00782         "wunpckelub wr10, wr11          \n\t"
00783         "wunpckehub wr11, wr11          \n\t"
00784         "waddhus wr4, wr4, wr8          \n\t"
00785         "waddhus wr5, wr5, wr9          \n\t"
00786         "waddhus wr6, wr6, wr10         \n\t"
00787         "waddhus wr7, wr7, wr11         \n\t"
00788         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
00789         "waddhus wr9, wr1, wr5          \n\t"
00790         "waddhus wr10, wr2, wr6         \n\t"
00791         "waddhus wr11, wr3, wr7         \n\t"
00792         "waddhus wr8, wr8, wr15         \n\t" /* + rounding term */
00793         "waddhus wr9, wr9, wr15         \n\t"
00794         "waddhus wr10, wr10, wr15       \n\t"
00795         "waddhus wr11, wr11, wr15       \n\t"
00796         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
00797         "wsrlhg wr9, wr9, wcgr0         \n\t"
00798         "wsrlhg wr10, wr10, wcgr0       \n\t"
00799         "wsrlhg wr11, wr11, wcgr0       \n\t"
00800         "wpackhus wr8, wr8, wr9         \n\t" /* back to 2 x 8 unsigned bytes */
00801         "wpackhus wr9, wr10, wr11       \n\t"
00802         "wstrd wr8, [%[block]]          \n\t"
00803         "wstrd wr9, [%[block], #8]      \n\t"
00804         "add %[block], %[block], %[line_size]   \n\t"
00805 
00806         // [wr0 wr1 wr2 wr3] <= *   (roles swap; flags from the cmp at label 1 are still live here)
00807         // [wr4 wr5 wr6 wr7]
00808         "wldrd wr12, [%[pixels]]        \n\t"
00809         "wldrd wr13, [%[pixels], #8]    \n\t"
00810         "wldrd wr14, [%[pixels], #16]   \n\t"
00811         "add %[pixels], %[pixels], %[line_size] \n\t"
00812         "walignr1 wr2, wr12, wr13       \n\t"
00813         "pld [%[pixels]]                \n\t"
00814         "pld [%[pixels], #32]           \n\t"
00815         "walignr1 wr3, wr13, wr14       \n\t"
00816         "wmoveq wr10, wr13              \n\t"
00817         "wmoveq wr11, wr14              \n\t"
00818         "walignr2ne wr10, wr12, wr13    \n\t"
00819         "walignr2ne wr11, wr13, wr14    \n\t"
00820         "wunpckelub wr0, wr2            \n\t"
00821         "wunpckehub wr1, wr2            \n\t"
00822         "wunpckelub wr2, wr3            \n\t"
00823         "wunpckehub wr3, wr3            \n\t"
00824         "wunpckelub wr8, wr10           \n\t"
00825         "wunpckehub wr9, wr10           \n\t"
00826         "wunpckelub wr10, wr11          \n\t"
00827         "wunpckehub wr11, wr11          \n\t"
00828         "waddhus wr0, wr0, wr8          \n\t"
00829         "waddhus wr1, wr1, wr9          \n\t"
00830         "waddhus wr2, wr2, wr10         \n\t"
00831         "waddhus wr3, wr3, wr11         \n\t"
00832         "waddhus wr8, wr0, wr4          \n\t"
00833         "waddhus wr9, wr1, wr5          \n\t"
00834         "waddhus wr10, wr2, wr6         \n\t"
00835         "waddhus wr11, wr3, wr7         \n\t"
00836         "waddhus wr8, wr8, wr15         \n\t"
00837         "waddhus wr9, wr9, wr15         \n\t"
00838         "waddhus wr10, wr10, wr15       \n\t"
00839         "waddhus wr11, wr11, wr15       \n\t"
00840         "wsrlhg wr8, wr8, wcgr0         \n\t"
00841         "wsrlhg wr9, wr9, wcgr0         \n\t"
00842         "wsrlhg wr10, wr10, wcgr0       \n\t"
00843         "wsrlhg wr11, wr11, wcgr0       \n\t"
00844         "wpackhus wr8, wr8, wr9         \n\t"
00845         "wpackhus wr9, wr10, wr11       \n\t"
00846         "wstrd wr8, [%[block]]          \n\t"
00847         "wstrd wr9, [%[block], #8]      \n\t"
00848         "add %[block], %[block], %[line_size]   \n\t"
00849 
00850         "subs %[h], %[h], #2            \n\t" /* two rows per pass; the loop ends only when h hits exactly 0 */
00851         "bne 1b                         \n\t"
00852         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00853         : [line_size]"r"(line_size)
00854         : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
00855 }
00856 
00857 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00858 {
00859     // Per row: computes 8 bytes of (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2, combines them with the bytes already in block via WAVG2B, stores back.
00860     // [wr0 wr1 wr2 wr3] for previous line, [wr4 wr5 wr6 wr7] for current line (this 8-wide variant only uses wr0-wr2 / wr4-wr6).
00861     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; added to the 4-sample sum before the >> 2
00862     __asm__ volatile(
00863         "pld [%[block]]                 \n\t"
00864         "pld [%[block], #32]            \n\t"
00865         "pld [%[pixels]]                \n\t"
00866         "mov r12, #2                    \n\t"
00867         "pld [%[pixels], #32]           \n\t"
00868         "tmcr wcgr0, r12                \n\t" /* for shift value */
00869         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
00870         "bic %[pixels], %[pixels], #7   \n\t" /* round pixels down to an 8-byte boundary */
00871         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, consumed by walignr1 */
00872 
00873         // [wr0 wr1 wr2 wr3] <= *   (prime the previous-line sums from the first source row)
00874         // [wr4 wr5 wr6 wr7]
00875         "wldrd wr12, [%[pixels]]        \n\t"
00876         "add r12, r12, #1               \n\t" /* offset of the right-hand neighbour p[x+1] */
00877         "wldrd wr13, [%[pixels], #8]    \n\t"
00878         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, consumed by walignr2 */
00879         "add %[pixels], %[pixels], %[line_size] \n\t"
00880         "cmp r12, #8                    \n\t" /* eq: p[x+1] starts exactly at wr13 */
00881         "pld [%[pixels]]                \n\t"
00882         "pld [%[pixels], #32]           \n\t"
00883         "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = p[0..7] */
00884         "wmoveq wr10, wr13              \n\t" /* offset + 1 == 8: presumably not expressible as a walignr amount, so copy */
00885         "walignr2ne wr10, wr12, wr13    \n\t" /* wr10 = p[1..8] */
00886         "wunpckelub wr0, wr2            \n\t" /* widen bytes to unsigned 16-bit lanes */
00887         "wunpckehub wr1, wr2            \n\t"
00888         "wunpckelub wr8, wr10           \n\t"
00889         "wunpckehub wr9, wr10           \n\t"
00890         "waddhus wr0, wr0, wr8          \n\t" /* wr0/wr1 = p[x] + p[x+1] */
00891         "waddhus wr1, wr1, wr9          \n\t"
00892 
00893         "1:                             \n\t"
00894         // [wr0 wr1 wr2 wr3]
00895         // [wr4 wr5 wr6 wr7] <= *   (horizontal sums of the next source row)
00896         "wldrd wr12, [%[pixels]]        \n\t"
00897         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
00898         "wldrd wr13, [%[pixels], #8]    \n\t"
00899         "add %[pixels], %[pixels], %[line_size] \n\t"
00900         "walignr1 wr6, wr12, wr13       \n\t"
00901         "pld [%[pixels]]                \n\t"
00902         "pld [%[pixels], #32]           \n\t"
00903         "wmoveq wr10, wr13              \n\t"
00904         "walignr2ne wr10, wr12, wr13    \n\t"
00905         "wunpckelub wr4, wr6            \n\t"
00906         "wunpckehub wr5, wr6            \n\t"
00907         "wunpckelub wr8, wr10           \n\t"
00908         "wunpckehub wr9, wr10           \n\t"
00909         "waddhus wr4, wr4, wr8          \n\t"
00910         "waddhus wr5, wr5, wr9          \n\t"
00911         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
00912         "waddhus wr9, wr1, wr5          \n\t"
00913         "waddhus wr8, wr8, wr15         \n\t" /* + rounding term */
00914         "waddhus wr9, wr9, wr15         \n\t"
00915         "wldrd wr12, [%[block]]         \n\t" /* bytes currently stored in the destination row */
00916         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
00917         "wsrlhg wr9, wr9, wcgr0         \n\t"
00918         "wpackhus wr8, wr8, wr9         \n\t" /* back to 8 unsigned bytes */
00919         WAVG2B" wr8, wr8, wr12          \n\t" /* combine with destination; WAVG2B is a macro defined outside this view */
00920         "wstrd wr8, [%[block]]          \n\t"
00921         "add %[block], %[block], %[line_size]   \n\t"
00922         "wldrd wr12, [%[pixels]]        \n\t" /* first doubleword of the next source row, loaded early */
00923         "pld [%[block]]                 \n\t"
00924         "pld [%[block], #32]            \n\t"
00925 
00926         // [wr0 wr1 wr2 wr3] <= *   (roles swap; flags from the cmp at label 1 are still live here)
00927         // [wr4 wr5 wr6 wr7]
00928         "wldrd wr13, [%[pixels], #8]    \n\t"
00929         "add %[pixels], %[pixels], %[line_size] \n\t"
00930         "walignr1 wr2, wr12, wr13       \n\t"
00931         "pld [%[pixels]]                \n\t"
00932         "pld [%[pixels], #32]           \n\t"
00933         "wmoveq wr10, wr13              \n\t"
00934         "walignr2ne wr10, wr12, wr13    \n\t"
00935         "wunpckelub wr0, wr2            \n\t"
00936         "wunpckehub wr1, wr2            \n\t"
00937         "wunpckelub wr8, wr10           \n\t"
00938         "wunpckehub wr9, wr10           \n\t"
00939         "waddhus wr0, wr0, wr8          \n\t"
00940         "waddhus wr1, wr1, wr9          \n\t"
00941         "waddhus wr8, wr0, wr4          \n\t"
00942         "waddhus wr9, wr1, wr5          \n\t"
00943         "waddhus wr8, wr8, wr15         \n\t"
00944         "waddhus wr9, wr9, wr15         \n\t"
00945         "wldrd wr12, [%[block]]         \n\t"
00946         "wsrlhg wr8, wr8, wcgr0         \n\t"
00947         "wsrlhg wr9, wr9, wcgr0         \n\t"
00948         "wpackhus wr8, wr8, wr9         \n\t"
00949         "subs %[h], %[h], #2            \n\t" /* two rows per pass; the loop ends only when h hits exactly 0 */
00950         WAVG2B" wr8, wr8, wr12          \n\t"
00951         "wstrd wr8, [%[block]]          \n\t"
00952         "add %[block], %[block], %[line_size]   \n\t"
00953         "pld [%[block]]                 \n\t"
00954         "pld [%[block], #32]            \n\t"
00955         "bne 1b                         \n\t"
00956         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
00957         : [line_size]"r"(line_size)
00958         : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
00959 }
00960 
00961 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
00962 {
00963     // Per row: computes 16 bytes of (p[x] + p[x+1] + q[x] + q[x+1] + wr15) >> 2, combines them with the bytes already in block via WAVG2B, stores back.
00964     // [wr0 wr1 wr2 wr3] for previous line, [wr4 wr5 wr6 wr7] for current line (four 16-bit lanes per register).
00965     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version; added to the 4-sample sum before the >> 2
00966     __asm__ volatile(
00967         "pld [%[block]]                 \n\t"
00968         "pld [%[block], #32]            \n\t"
00969         "pld [%[pixels]]                \n\t"
00970         "mov r12, #2                    \n\t"
00971         "pld [%[pixels], #32]           \n\t"
00972         "tmcr wcgr0, r12                \n\t" /* for shift value */
00973         /* alignment */
00974         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset of pixels inside its 8-byte word */
00975         "bic %[pixels], %[pixels], #7           \n\t" /* round pixels down to an 8-byte boundary */
00976         "tmcr wcgr1, r12                \n\t" /* wcgr1 = offset, consumed by walignr1 */
00977         "add r12, r12, #1               \n\t"
00978         "tmcr wcgr2, r12                \n\t" /* wcgr2 = offset + 1, consumed by walignr2 */
00979 
00980         // [wr0 wr1 wr2 wr3] <= *   (prime the previous-line sums from the first source row)
00981         // [wr4 wr5 wr6 wr7]
00982         "wldrd wr12, [%[pixels]]        \n\t"
00983         "cmp r12, #8                    \n\t" /* eq: p[x+1] starts exactly at the following doubleword */
00984         "wldrd wr13, [%[pixels], #8]    \n\t"
00985         "wldrd wr14, [%[pixels], #16]   \n\t" /* third doubleword covers the unaligned tail */
00986         "add %[pixels], %[pixels], %[line_size] \n\t"
00987         "pld [%[pixels]]                \n\t"
00988         "walignr1 wr2, wr12, wr13       \n\t" /* wr2 = p[0..7] */
00989         "pld [%[pixels], #32]           \n\t"
00990         "walignr1 wr3, wr13, wr14       \n\t" /* wr3 = p[8..15] */
00991         "wmoveq wr10, wr13              \n\t" /* offset + 1 == 8: presumably not expressible as a walignr amount, so copy */
00992         "wmoveq wr11, wr14              \n\t"
00993         "walignr2ne wr10, wr12, wr13    \n\t" /* wr10 = p[1..8] */
00994         "walignr2ne wr11, wr13, wr14    \n\t" /* wr11 = p[9..16] */
00995         "wunpckelub wr0, wr2            \n\t" /* widen bytes to unsigned 16-bit lanes */
00996         "wunpckehub wr1, wr2            \n\t"
00997         "wunpckelub wr2, wr3            \n\t" /* wr3 is read here before being overwritten on the next line */
00998         "wunpckehub wr3, wr3            \n\t"
00999         "wunpckelub wr8, wr10           \n\t"
01000         "wunpckehub wr9, wr10           \n\t"
01001         "wunpckelub wr10, wr11          \n\t"
01002         "wunpckehub wr11, wr11          \n\t"
01003         "waddhus wr0, wr0, wr8          \n\t" /* wr0..wr3 = p[x] + p[x+1] */
01004         "waddhus wr1, wr1, wr9          \n\t"
01005         "waddhus wr2, wr2, wr10         \n\t"
01006         "waddhus wr3, wr3, wr11         \n\t"
01007 
01008         "1:                             \n\t"
01009         // [wr0 wr1 wr2 wr3]
01010         // [wr4 wr5 wr6 wr7] <= *   (horizontal sums of the next source row)
01011         "wldrd wr12, [%[pixels]]        \n\t"
01012         "cmp r12, #8                    \n\t" /* redo the test: subs at the loop end overwrote the flags */
01013         "wldrd wr13, [%[pixels], #8]    \n\t"
01014         "wldrd wr14, [%[pixels], #16]   \n\t"
01015         "add %[pixels], %[pixels], %[line_size] \n\t"
01016         "walignr1 wr6, wr12, wr13       \n\t"
01017         "pld [%[pixels]]                \n\t"
01018         "pld [%[pixels], #32]           \n\t"
01019         "walignr1 wr7, wr13, wr14       \n\t"
01020         "wmoveq wr10, wr13              \n\t"
01021         "wmoveq wr11, wr14              \n\t"
01022         "walignr2ne wr10, wr12, wr13    \n\t"
01023         "walignr2ne wr11, wr13, wr14    \n\t"
01024         "wunpckelub wr4, wr6            \n\t"
01025         "wunpckehub wr5, wr6            \n\t"
01026         "wunpckelub wr6, wr7            \n\t"
01027         "wunpckehub wr7, wr7            \n\t"
01028         "wunpckelub wr8, wr10           \n\t"
01029         "wunpckehub wr9, wr10           \n\t"
01030         "wunpckelub wr10, wr11          \n\t"
01031         "wunpckehub wr11, wr11          \n\t"
01032         "waddhus wr4, wr4, wr8          \n\t"
01033         "waddhus wr5, wr5, wr9          \n\t"
01034         "waddhus wr6, wr6, wr10         \n\t"
01035         "waddhus wr7, wr7, wr11         \n\t"
01036         "waddhus wr8, wr0, wr4          \n\t" /* previous-row sums + current-row sums */
01037         "waddhus wr9, wr1, wr5          \n\t"
01038         "waddhus wr10, wr2, wr6         \n\t"
01039         "waddhus wr11, wr3, wr7         \n\t"
01040         "waddhus wr8, wr8, wr15         \n\t" /* + rounding term */
01041         "waddhus wr9, wr9, wr15         \n\t"
01042         "waddhus wr10, wr10, wr15       \n\t"
01043         "waddhus wr11, wr11, wr15       \n\t"
01044         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 */
01045         "wsrlhg wr9, wr9, wcgr0         \n\t"
01046         "wldrd wr12, [%[block]]         \n\t" /* bytes currently stored in the destination row */
01047         "wldrd wr13, [%[block], #8]     \n\t"
01048         "wsrlhg wr10, wr10, wcgr0       \n\t"
01049         "wsrlhg wr11, wr11, wcgr0       \n\t"
01050         "wpackhus wr8, wr8, wr9         \n\t" /* back to 2 x 8 unsigned bytes */
01051         "wpackhus wr9, wr10, wr11       \n\t"
01052         WAVG2B" wr8, wr8, wr12          \n\t" /* combine with destination; WAVG2B is a macro defined outside this view */
01053         WAVG2B" wr9, wr9, wr13          \n\t"
01054         "wstrd wr8, [%[block]]          \n\t"
01055         "wstrd wr9, [%[block], #8]      \n\t"
01056         "add %[block], %[block], %[line_size]   \n\t"
01057 
01058         // [wr0 wr1 wr2 wr3] <= *   (roles swap; flags from the cmp at label 1 are still live here)
01059         // [wr4 wr5 wr6 wr7]
01060         "wldrd wr12, [%[pixels]]        \n\t"
01061         "pld [%[block]]                 \n\t"
01062         "wldrd wr13, [%[pixels], #8]    \n\t"
01063         "pld [%[block], #32]            \n\t"
01064         "wldrd wr14, [%[pixels], #16]   \n\t"
01065         "add %[pixels], %[pixels], %[line_size] \n\t"
01066         "walignr1 wr2, wr12, wr13       \n\t"
01067         "pld [%[pixels]]                \n\t"
01068         "pld [%[pixels], #32]           \n\t"
01069         "walignr1 wr3, wr13, wr14       \n\t"
01070         "wmoveq wr10, wr13              \n\t"
01071         "wmoveq wr11, wr14              \n\t"
01072         "walignr2ne wr10, wr12, wr13    \n\t"
01073         "walignr2ne wr11, wr13, wr14    \n\t"
01074         "wunpckelub wr0, wr2            \n\t"
01075         "wunpckehub wr1, wr2            \n\t"
01076         "wunpckelub wr2, wr3            \n\t"
01077         "wunpckehub wr3, wr3            \n\t"
01078         "wunpckelub wr8, wr10           \n\t"
01079         "wunpckehub wr9, wr10           \n\t"
01080         "wunpckelub wr10, wr11          \n\t"
01081         "wunpckehub wr11, wr11          \n\t"
01082         "waddhus wr0, wr0, wr8          \n\t"
01083         "waddhus wr1, wr1, wr9          \n\t"
01084         "waddhus wr2, wr2, wr10         \n\t"
01085         "waddhus wr3, wr3, wr11         \n\t"
01086         "waddhus wr8, wr0, wr4          \n\t"
01087         "waddhus wr9, wr1, wr5          \n\t"
01088         "waddhus wr10, wr2, wr6         \n\t"
01089         "waddhus wr11, wr3, wr7         \n\t"
01090         "waddhus wr8, wr8, wr15         \n\t"
01091         "waddhus wr9, wr9, wr15         \n\t"
01092         "waddhus wr10, wr10, wr15       \n\t"
01093         "waddhus wr11, wr11, wr15       \n\t"
01094         "wsrlhg wr8, wr8, wcgr0         \n\t"
01095         "wsrlhg wr9, wr9, wcgr0         \n\t"
01096         "wldrd wr12, [%[block]]         \n\t"
01097         "wldrd wr13, [%[block], #8]     \n\t"
01098         "wsrlhg wr10, wr10, wcgr0       \n\t"
01099         "wsrlhg wr11, wr11, wcgr0       \n\t"
01100         "wpackhus wr8, wr8, wr9         \n\t"
01101         "wpackhus wr9, wr10, wr11       \n\t"
01102         WAVG2B" wr8, wr8, wr12          \n\t"
01103         WAVG2B" wr9, wr9, wr13          \n\t"
01104         "wstrd wr8, [%[block]]          \n\t"
01105         "wstrd wr9, [%[block], #8]      \n\t"
01106         "add %[block], %[block], %[line_size]   \n\t"
01107         "subs %[h], %[h], #2            \n\t" /* two rows per pass; the loop ends only when h hits exactly 0 */
01108         "pld [%[block]]                 \n\t"
01109         "pld [%[block], #32]            \n\t"
01110         "bne 1b                         \n\t"
01111         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
01112         : [line_size]"r"(line_size)
01113         : "r12", "cc", "memory"); /* "cc": cmp and subs above modify the condition flags */
01114 }

Generated on Fri Sep 16 2011 17:17:34 for FFmpeg by doxygen 1.7.1