• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libswscale/ppc/yuv2rgb_altivec.c

Go to the documentation of this file.
00001 /*
00002  * AltiVec acceleration for colorspace conversion
00003  *
00004  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 /*
00024 Convert I420 YV12 to RGB in various formats,
00025   it rejects images that are not in 420 formats,
00026   it rejects images that don't have widths of multiples of 16,
00027   it rejects images that don't have heights of multiples of 2.
00028 Rejected inputs are handed to the C fallback code.
00029 
00030 Lots of optimizations to be done here.
00031 
00032 1. Need to fix saturation code. I just couldn't get it to fly with packs
00033    and adds, so we currently use max/min to clip.
00034 
00035 2. The inefficient use of chroma loading needs a bit of brushing up.
00036 
00037 3. Analysis of pipeline stalls needs to be done. Use shark to identify
00038    pipeline stalls.
00039 
00040 
00041 MODIFIED to calculate coeffs from currently selected color space.
00042 MODIFIED core to be a macro where you specify the output format.
00043 ADDED UYVY conversion which is never called due to some thing in swscale.
00044 CORRECTED algorithm selection to be strict on input formats.
00045 ADDED runtime detection of AltiVec.
00046 
00047 ADDED altivec_yuv2packedX vertical scl + RGB converter
00048 
00049 March 27,2004
00050 PERFORMANCE ANALYSIS
00051 
00052 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
00053 used as test.
00054 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
00055 same sequence.
00056 
00057 720 * 480 * 30  ~10MPS
00058 
00059 so we have roughly 10 clocks per pixel. This is too high, something has
00060 to be wrong.
00061 
00062 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
00063 need for vec_min.
00064 
00065 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
00066 the input video frame, it was just decompressed so it probably resides in L1
00067 caches. However, we are creating the output video stream. This needs to use the
00068 DSTST instruction to optimize for the cache. We couple this with the fact that
00069 we are not going to be visiting the input buffer again so we mark it Least
00070 Recently Used. This shaves 25% of the processor cycles off.
00071 
00072 Now memcpy is the largest mips consumer in the system, probably due
00073 to the inefficient X11 stuff.
00074 
00075 GL libraries seem to be very slow on this machine 1.33Ghz PB running
00076 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
00077 a versioning issue, however I have libGL.1.2.dylib for both
00078 machines. (We need to figure this out now.)
00079 
00080 GL2 libraries work now with patch for RGB32.
00081 
00082 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
00083 
00084 Integrated luma prescaling adjustment for saturation/contrast/brightness
00085 adjustment.
00086 */
00087 
00088 #include <stdio.h>
00089 #include <stdlib.h>
00090 #include <string.h>
00091 #include <inttypes.h>
00092 #include <assert.h>
00093 #include "config.h"
00094 #include "libswscale/rgb2rgb.h"
00095 #include "libswscale/swscale.h"
00096 #include "libswscale/swscale_internal.h"
00097 
00098 #undef PROFILE_THE_BEAST
00099 #undef INC_SCALING
00100 
00101 typedef unsigned char ubyte;
00102 typedef signed char   sbyte;
00103 
00104 
00105 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
00106    homogeneous vector registers x0,x1,x2 are interleaved with the
00107    following technique:
00108 
00109       o0 = vec_mergeh (x0,x1);
00110       o1 = vec_perm (o0, x2, perm_rgb_0);
00111       o2 = vec_perm (o0, x2, perm_rgb_1);
00112       o3 = vec_mergel (x0,x1);
00113       o4 = vec_perm (o3,o2,perm_rgb_2);
00114       o5 = vec_perm (o3,o2,perm_rgb_3);
00115 
00116   perm_rgb_0:   o0(RG).h v1(B) --> o1*
00117               0   1  2   3   4
00118              rgbr|gbrg|brgb|rgbr
00119              0010 0100 1001 0010
00120              0102 3145 2673 894A
00121 
00122   perm_rgb_1:   o0(RG).h v1(B) --> o2
00123               0   1  2   3   4
00124              gbrg|brgb|bbbb|bbbb
00125              0100 1001 1111 1111
00126              B5CD 6EF7 89AB CDEF
00127 
00128   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
00129               0   1  2   3   4
00130              gbrg|brgb|rgbr|gbrg
00131              1111 1111 0010 0100
00132              89AB CDEF 0182 3945
00133 
00134   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
00135               0   1  2   3   4
00136              brgb|rgbr|gbrg|brgb
00137              1001 0010 0100 1001
00138              a67b 89cA BdCD eEFf
00139 
00140 */
00141 static /* vec_perm control vectors for vec_merge3: indices 0x00-0x0f select bytes of the first operand, 0x10-0x1f of the second */
00142 const vector unsigned char
00143   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
00144                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a}, /* triples 0-4 and the first byte of triple 5 */
00145   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
00146                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f}, /* rest of triple 5, triples 6-7, then second-operand bytes 8-15 passed through */
00147   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
00148                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05}, /* second-operand bytes 0-7, then triples 8-9 and two bytes of triple 10 */
00149   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
00150                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f}; /* last byte of triple 10, then triples 11-15 */
00151 /* vec_merge3: interleave three 16-byte vectors into the 48 bytes y0,y1,y2, each triple ordered x0,x1,x2. Note that the parameter list names the inputs in reverse order (x2 is the first argument). */
00152 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
00153 do {                                        \
00154     __typeof__(x0) o0,o2,o3;                \
00155         o0 = vec_mergeh (x0,x1);            \
00156         y0 = vec_perm (o0, x2, perm_rgb_0); \
00157         o2 = vec_perm (o0, x2, perm_rgb_1); \
00158         o3 = vec_mergel (x0,x1);            \
00159         y1 = vec_perm (o3,o2,perm_rgb_2);   \
00160         y2 = vec_perm (o3,o2,perm_rgb_3);   \
00161 } while(0)
00162 /* vec_mstbgr24: interleave 16 three-byte pixels and store them with three vec_st calls, post-incrementing ptr after each. Bytes within a pixel are ordered x2,x1,x0, i.e. B,G,R when called as (R,G,B,ptr). */
00163 #define vec_mstbgr24(x0,x1,x2,ptr)      \
00164 do {                                    \
00165     __typeof__(x0) _0,_1,_2;            \
00166     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
00167     vec_st (_0, 0, ptr++);              \
00168     vec_st (_1, 0, ptr++);              \
00169     vec_st (_2, 0, ptr++);              \
00170 }  while (0)
00171 /* vec_mstrgb24: same as vec_mstbgr24 but with the arguments passed to vec_merge3 reversed, so bytes within a pixel are ordered x0,x1,x2, i.e. R,G,B when called as (R,G,B,ptr). */
00172 #define vec_mstrgb24(x0,x1,x2,ptr)      \
00173 do {                                    \
00174     __typeof__(x0) _0,_1,_2;            \
00175     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
00176     vec_st (_0, 0, ptr++);              \
00177     vec_st (_1, 0, ptr++);              \
00178     vec_st (_2, 0, ptr++);              \
00179 }  while (0)
00180 
00181 /* vec_mstrgb32: interleave four 16-byte channel vectors x0..x3 (of vector type T)
00182    into 16 four-byte pixels whose bytes are ordered x0,x1,x2,x3, store the
00183    resulting 64 bytes at ptr with four vec_st calls, then advance ptr by 4.
00184 */
00185 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
00186 do {                                                                          \
00187     T _0,_1,_2,_3;                                                            \
00188     _0 = vec_mergeh (x0,x1);                                                  \
00189     _1 = vec_mergeh (x2,x3);                                                  \
00190     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
00191     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
00192     vec_st (_2, 0*16, (T *)ptr);                                              \
00193     vec_st (_3, 1*16, (T *)ptr);                                              \
00194     _0 = vec_mergel (x0,x1);                                                  \
00195     _1 = vec_mergel (x2,x3);                                                  \
00196     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
00197     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
00198     vec_st (_2, 2*16, (T *)ptr);                                              \
00199     vec_st (_3, 3*16, (T *)ptr);                                              \
00200     ptr += 4;                                                                 \
00201 }  while (0)
00202 
00203 /*
00204 
00205   | 1     0       1.4021   | | Y |
00206   | 1    -0.3441 -0.7142   |x| Cb|
00207   | 1     1.7718  0        | | Cr|
00208 
00209 
00210   Y:      [-128 127]
00211   Cb/Cr : [-128 127]
00212 
00213   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
00214 
00215 */
00216 
00217 
00218 /* vec_unh(x) / vec_unl(x): widen bytes 0-7 / 8-15 of x to eight 16-bit elements by pairing each with a zero byte taken from the second vec_perm operand */
00219 /* (index 0x10 = byte 0 of the all-zero vector, placed first in each pair); the result is cast to vector signed short. */
00220 #define vec_unh(x) \
00221     (vector signed short) \
00222         vec_perm(x,(__typeof__(x)){0}, \
00223                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
00224                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
00225 #define vec_unl(x) \
00226     (vector signed short) \
00227         vec_perm(x,(__typeof__(x)){0}, \
00228                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
00229                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
00230 /* vec_clip_s16(x): clamp every 16-bit element of x to the range [16,235]; not referenced in the visible part of this file. */
00231 #define vec_clip_s16(x) \
00232     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
00233                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
00234 /* vec_packclp(x,y): raise negative elements of x and y to 0 with vec_max, then pack the sixteen values into one byte vector with vec_packs (the file header notes this removes the need for vec_min). */
00235 #define vec_packclp(x,y) \
00236     (vector unsigned char)vec_packs \
00237         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
00238          (vector unsigned short)vec_max (y,((vector signed short) {0})))
00239 
00240 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
00241 /* cvtyuvtoRGB: convert eight 16-bit Y,U,V samples to R,G,B using the coefficients stored in the context (CY, OY, CBU, CRV, CGU, CGV, CSHIFT). */
00242 /* 128 is subtracted from U and V before use; results are written through R, G and B and are not clipped to byte range here. */
00243 static inline void cvtyuvtoRGB (SwsContext *c,
00244                                 vector signed short Y, vector signed short U, vector signed short V,
00245                                 vector signed short *R, vector signed short *G, vector signed short *B)
00246 {
00247     vector signed   short vx,ux,uvx;
00248 
00249     Y = vec_mradds (Y, c->CY, c->OY); /* luma scale (CY) plus offset (OY) */
00250     U  = vec_sub (U,(vector signed short)
00251                     vec_splat((vector signed short){128},0));
00252     V  = vec_sub (V,(vector signed short)
00253                     vec_splat((vector signed short){128},0));
00254 
00255     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
00256     ux = vec_sl (U, c->CSHIFT);
00257     *B = vec_mradds (ux, c->CBU, Y); /* B = Y + scaled U term */
00258 
00259     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
00260     vx = vec_sl (V, c->CSHIFT);
00261     *R = vec_mradds (vx, c->CRV, Y); /* R = Y + scaled V term */
00262 
00263     // uvx = ((CGU*u) + (CGV*v))>>15;
00264     uvx = vec_mradds (U, c->CGU, Y); /* Y + U contribution to green */
00265     *G  = vec_mradds (V, c->CGV, uvx); /* add the V contribution */
00266 }
00267 
00268 
00269 /*
00270   ------------------------------------------------------------------------------
00271   CS converters
00272   ------------------------------------------------------------------------------
00273 */
00274 /* DEFCSP420_CVT(name,out_pixels) defines static int altivec_<name>(): it converts srcSliceH rows of planar 4:2:0 input (in[0]=Y, in[1]=U, in[2]=V) to packed pixels through out_pixels and returns srcSliceH. */
00275 /* Only w/16 whole 16-pixel groups and h/2 row pairs are processed; this routine does nothing for any remainder. */
00276 #define DEFCSP420_CVT(name,out_pixels)                                  \
00277 static int altivec_##name (SwsContext *c,                               \
00278                            unsigned char **in, int *instrides,          \
00279                            int srcSliceY,        int srcSliceH,         \
00280                            unsigned char **oplanes, int *outstrides)    \
00281 {                                                                       \
00282     int w = c->srcW;                                                    \
00283     int h = srcSliceH;                                                  \
00284     int i,j;                                                            \
00285     int instrides_scl[3];                                               \
00286     vector unsigned char y0,y1;                                         \
00287                                                                         \
00288     vector signed char  u,v;                                            \
00289                                                                         \
00290     vector signed short Y0,Y1,Y2,Y3;                                    \
00291     vector signed short U,V;                                            \
00292     vector signed short vx,ux,uvx;                                      \
00293     vector signed short vx0,ux0,uvx0;                                   \
00294     vector signed short vx1,ux1,uvx1;                                   \
00295     vector signed short R0,G0,B0;                                       \
00296     vector signed short R1,G1,B1;                                       \
00297     vector unsigned char R,G,B;                                         \
00298                                                                         \
00299     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
00300     vector unsigned char align_perm;                                    \
00301     /* local copies of the conversion coefficients held in the context */ \
00302     vector signed short                                                 \
00303         lCY  = c->CY,                                                   \
00304         lOY  = c->OY,                                                   \
00305         lCRV = c->CRV,                                                  \
00306         lCBU = c->CBU,                                                  \
00307         lCGU = c->CGU,                                                  \
00308         lCGV = c->CGV;                                                  \
00309                                                                         \
00310     vector unsigned short lCSHIFT = c->CSHIFT;                          \
00311     /* y1i/y2i walk two consecutive luma rows; ui/vi walk the chroma rows */ \
00312     ubyte *y1i   = in[0];                                               \
00313     ubyte *y2i   = in[0]+instrides[0];                                  \
00314     ubyte *ui    = in[1];                                               \
00315     ubyte *vi    = in[2];                                               \
00316     /* oute/outo: output cursors for the first and second row of each pair */ \
00317     vector unsigned char *oute                                          \
00318         = (vector unsigned char *)                                      \
00319             (oplanes[0]+srcSliceY*outstrides[0]);                       \
00320     vector unsigned char *outo                                          \
00321         = (vector unsigned char *)                                      \
00322             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
00323                                                                         \
00324                                                                         \
00325     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
00326     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
00327     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
00328     /* each outer pass converts two luma rows that share one chroma row */ \
00329                                                                         \
00330     for (i=0;i<h/2;i++) {                                               \
00331         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
00332         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
00333         /* NOTE(review): the hint size above uses w*3 for every output format, including the 4-byte ones - confirm intent */ \
00334         for (j=0;j<w/16;j++) {                                          \
00335             /* loads below fetch two adjacent vectors and realign them with the vec_lvsl permute */ \
00336             y1ivP = (vector unsigned char *)y1i;                        \
00337             y2ivP = (vector unsigned char *)y2i;                        \
00338             uivP  = (vector unsigned char *)ui;                         \
00339             vivP  = (vector unsigned char *)vi;                         \
00340                                                                         \
00341             align_perm = vec_lvsl (0, y1i);                             \
00342             y0 = (vector unsigned char)                                 \
00343                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
00344                                                                         \
00345             align_perm = vec_lvsl (0, y2i);                             \
00346             y1 = (vector unsigned char)                                 \
00347                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
00348                                                                         \
00349             align_perm = vec_lvsl (0, ui);                              \
00350             u = (vector signed char)                                    \
00351                 vec_perm (uivP[0], uivP[1], align_perm);                \
00352                                                                         \
00353             align_perm = vec_lvsl (0, vi);                              \
00354             v = (vector signed char)                                    \
00355                 vec_perm (vivP[0], vivP[1], align_perm);                \
00356             /* subtract 128 from every chroma byte */                   \
00357             u  = (vector signed char)                                   \
00358                  vec_sub (u,(vector signed char)                        \
00359                           vec_splat((vector signed char){128},0));      \
00360             v  = (vector signed char)                                   \
00361                  vec_sub (v,(vector signed char)                        \
00362                           vec_splat((vector signed char){128},0));      \
00363             /* only the first 8 chroma samples are widened and used */  \
00364             U  = vec_unpackh (u);                                       \
00365             V  = vec_unpackh (v);                                       \
00366                                                                         \
00367             /* Y0/Y1: halves of the first luma row, Y2/Y3: of the second */ \
00368             Y0 = vec_unh (y0);                                          \
00369             Y1 = vec_unl (y0);                                          \
00370             Y2 = vec_unh (y1);                                          \
00371             Y3 = vec_unl (y1);                                          \
00372                                                                         \
00373             Y0 = vec_mradds (Y0, lCY, lOY);                             \
00374             Y1 = vec_mradds (Y1, lCY, lOY);                             \
00375             Y2 = vec_mradds (Y2, lCY, lOY);                             \
00376             Y3 = vec_mradds (Y3, lCY, lOY);                             \
00377                                                                         \
00378             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
00379             ux = vec_sl (U, lCSHIFT);                                   \
00380             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
00381             ux0  = vec_mergeh (ux,ux);                                  \
00382             ux1  = vec_mergel (ux,ux);                                  \
00383                                                                         \
00384             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
00385             vx = vec_sl (V, lCSHIFT);                                   \
00386             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
00387             vx0  = vec_mergeh (vx,vx);                                  \
00388             vx1  = vec_mergel (vx,vx);                                  \
00389                                                                         \
00390             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
00391             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
00392             uvx = vec_mradds (V, lCGV, uvx);                            \
00393             uvx0 = vec_mergeh (uvx,uvx);                                \
00394             uvx1 = vec_mergel (uvx,uvx);                                \
00395             /* first row: the merges above duplicated each chroma term for two pixels */ \
00396             R0 = vec_add (Y0,vx0);                                      \
00397             G0 = vec_add (Y0,uvx0);                                     \
00398             B0 = vec_add (Y0,ux0);                                      \
00399             R1 = vec_add (Y1,vx1);                                      \
00400             G1 = vec_add (Y1,uvx1);                                     \
00401             B1 = vec_add (Y1,ux1);                                      \
00402                                                                         \
00403             R  = vec_packclp (R0,R1);                                   \
00404             G  = vec_packclp (G0,G1);                                   \
00405             B  = vec_packclp (B0,B1);                                   \
00406                                                                         \
00407             out_pixels(R,G,B,oute);                                     \
00408             /* second row reuses the same chroma terms */               \
00409             R0 = vec_add (Y2,vx0);                                      \
00410             G0 = vec_add (Y2,uvx0);                                     \
00411             B0 = vec_add (Y2,ux0);                                      \
00412             R1 = vec_add (Y3,vx1);                                      \
00413             G1 = vec_add (Y3,uvx1);                                     \
00414             B1 = vec_add (Y3,ux1);                                      \
00415             R  = vec_packclp (R0,R1);                                   \
00416             G  = vec_packclp (G0,G1);                                   \
00417             B  = vec_packclp (B0,B1);                                   \
00418                                                                         \
00419                                                                         \
00420             out_pixels(R,G,B,outo);                                     \
00421                                                                         \
00422             y1i  += 16;                                                 \
00423             y2i  += 16;                                                 \
00424             ui   += 8;                                                  \
00425             vi   += 8;                                                  \
00426                                                                         \
00427         }                                                               \
00428         /* NOTE(review): out_pixels already moved oute/outo across this row, so adding a full stride lands two rows down only if the row's pixel bytes equal outstrides[0] - confirm */ \
00429         outo  += (outstrides[0])>>4;                                    \
00430         oute  += (outstrides[0])>>4;                                    \
00431                                                                         \
00432         ui    += instrides_scl[1];                                      \
00433         vi    += instrides_scl[2];                                      \
00434         y1i   += instrides_scl[0];                                      \
00435         y2i   += instrides_scl[0];                                      \
00436     }                                                                   \
00437     return srcSliceH;                                                   \
00438 }
00439 
00440 
00441 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
00442 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
00443 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
00444 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
00445 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
00446 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
00447 
00448 DEFCSP420_CVT (yuv2_abgr, out_abgr) /* defines altivec_yuv2_abgr */
00449 #if 1 /* 1: generate the BGRA converter from the macro; the #else branch is compiled out */
00450 DEFCSP420_CVT (yuv2_bgra, out_bgra) /* defines altivec_yuv2_bgra */
00451 #else /* Compiled out while the #if above is 1: hand-expanded variant of the macro-generated converter. */
00452 static int altivec_yuv2_bgra32 (SwsContext *c,
00453                                 unsigned char **in, int *instrides,
00454                                 int srcSliceY,        int srcSliceH,
00455                                 unsigned char **oplanes, int *outstrides)
00456 {
00457     int w = c->srcW;
00458     int h = srcSliceH;
00459     int i,j;
00460     int instrides_scl[3];
00461     vector unsigned char y0,y1;
00462 
00463     vector signed char  u,v;
00464 
00465     vector signed short Y0,Y1,Y2,Y3;
00466     vector signed short U,V;
00467     vector signed short vx,ux,uvx;
00468     vector signed short vx0,ux0,uvx0;
00469     vector signed short vx1,ux1,uvx1;
00470     vector signed short R0,G0,B0;
00471     vector signed short R1,G1,B1;
00472     vector unsigned char R,G,B;
00473 
00474     vector unsigned char *uivP, *vivP;
00475     vector unsigned char align_perm;
00476 
00477     vector signed short
00478         lCY  = c->CY,
00479         lOY  = c->OY,
00480         lCRV = c->CRV,
00481         lCBU = c->CBU,
00482         lCGU = c->CGU,
00483         lCGV = c->CGV;
00484 
00485     vector unsigned short lCSHIFT = c->CSHIFT;
00486     /* NOTE(review): the second luma row starts at in[0]+w here, whereas DEFCSP420_CVT uses in[0]+instrides[0] */
00487     ubyte *y1i   = in[0];
00488     ubyte *y2i   = in[0]+w;
00489     ubyte *ui    = in[1];
00490     ubyte *vi    = in[2];
00491 
00492     vector unsigned char *oute
00493         = (vector unsigned char *)
00494           (oplanes[0]+srcSliceY*outstrides[0]);
00495     vector unsigned char *outo
00496         = (vector unsigned char *)
00497           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
00498 
00499     /* NOTE(review): luma pointers get a full instrides[0] added after the inner loop already moved them (DEFCSP420_CVT uses instrides[0]*2-w) */
00500     instrides_scl[0] = instrides[0];
00501     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
00502     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
00503 
00504 
00505     for (i=0;i<h/2;i++) {
00506         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
00507         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
00508 
00509         for (j=0;j<w/16;j++) {
00510             /* luma is fetched with vec_ldl, without the two-vector realigning permute used for chroma below */
00511             y0 = vec_ldl (0,y1i);
00512             y1 = vec_ldl (0,y2i);
00513             uivP = (vector unsigned char *)ui;
00514             vivP = (vector unsigned char *)vi;
00515 
00516             align_perm = vec_lvsl (0, ui);
00517             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
00518 
00519             align_perm = vec_lvsl (0, vi);
00520             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
00521             u  = (vector signed char)
00522                  vec_sub (u,(vector signed char)
00523                           vec_splat((vector signed char){128},0));
00524 
00525             v  = (vector signed char)
00526                  vec_sub (v, (vector signed char)
00527                           vec_splat((vector signed char){128},0));
00528 
00529             U  = vec_unpackh (u);
00530             V  = vec_unpackh (v);
00531 
00532 
00533             Y0 = vec_unh (y0);
00534             Y1 = vec_unl (y0);
00535             Y2 = vec_unh (y1);
00536             Y3 = vec_unl (y1);
00537 
00538             Y0 = vec_mradds (Y0, lCY, lOY);
00539             Y1 = vec_mradds (Y1, lCY, lOY);
00540             Y2 = vec_mradds (Y2, lCY, lOY);
00541             Y3 = vec_mradds (Y3, lCY, lOY);
00542 
00543             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
00544             ux = vec_sl (U, lCSHIFT);
00545             ux = vec_mradds (ux, lCBU, (vector signed short){0});
00546             ux0  = vec_mergeh (ux,ux);
00547             ux1  = vec_mergel (ux,ux);
00548 
00549             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
00550             vx = vec_sl (V, lCSHIFT);
00551             vx = vec_mradds (vx, lCRV, (vector signed short){0});
00552             vx0  = vec_mergeh (vx,vx);
00553             vx1  = vec_mergel (vx,vx);
00554             /* uvx = ((CGU*u) + (CGV*v))>>15 */
00555             uvx = vec_mradds (U, lCGU, (vector signed short){0});
00556             uvx = vec_mradds (V, lCGV, uvx);
00557             uvx0 = vec_mergeh (uvx,uvx);
00558             uvx1 = vec_mergel (uvx,uvx);
00559             R0 = vec_add (Y0,vx0);
00560             G0 = vec_add (Y0,uvx0);
00561             B0 = vec_add (Y0,ux0);
00562             R1 = vec_add (Y1,vx1);
00563             G1 = vec_add (Y1,uvx1);
00564             B1 = vec_add (Y1,ux1);
00565             R  = vec_packclp (R0,R1);
00566             G  = vec_packclp (G0,G1);
00567             B  = vec_packclp (B0,B1);
00568             /* NOTE(review): pixels are stored with out_argb although the function is named ..._bgra32 */
00569             out_argb(R,G,B,oute);
00570             R0 = vec_add (Y2,vx0);
00571             G0 = vec_add (Y2,uvx0);
00572             B0 = vec_add (Y2,ux0);
00573             R1 = vec_add (Y3,vx1);
00574             G1 = vec_add (Y3,uvx1);
00575             B1 = vec_add (Y3,ux1);
00576             R  = vec_packclp (R0,R1);
00577             G  = vec_packclp (G0,G1);
00578             B  = vec_packclp (B0,B1);
00579 
00580             out_argb(R,G,B,outo);
00581             y1i  += 16;
00582             y2i  += 16;
00583             ui   += 8;
00584             vi   += 8;
00585 
00586         }
00587 
00588         outo  += (outstrides[0])>>4;
00589         oute  += (outstrides[0])>>4;
00590 
00591         ui    += instrides_scl[1];
00592         vi    += instrides_scl[2];
00593         y1i   += instrides_scl[0];
00594         y2i   += instrides_scl[0];
00595     }
00596     return srcSliceH;
00597 }
00598 
00599 #endif
00600 /* Instantiate the remaining converters from DEFCSP420_CVT: */
00601 /* altivec_yuv2_rgba, altivec_yuv2_argb, altivec_yuv2_rgb24 and altivec_yuv2_bgr24. */
00602 DEFCSP420_CVT (yuv2_rgba, out_rgba)
00603 DEFCSP420_CVT (yuv2_argb, out_argb)
00604 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
00605 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
00606 
00607 
00608 // uyvy|uyvy|uyvy|uyvy
00609 // 0123 4567 89ab cdef
00610 static /* vec_perm controls that split a 16-byte UYVY vector into eight 16-bit lanes; each 0x10 picks byte 0 of the second operand, which is a zero vector at the call sites in altivec_uyvy_rgb32 */
00611 const vector unsigned char
00612     demux_u = {0x10,0x00,0x10,0x00,
00613                0x10,0x04,0x10,0x04,
00614                0x10,0x08,0x10,0x08,
00615                0x10,0x0c,0x10,0x0c}, /* source bytes 0,4,8,12 (U), each used for two lanes */
00616     demux_v = {0x10,0x02,0x10,0x02,
00617                0x10,0x06,0x10,0x06,
00618                0x10,0x0A,0x10,0x0A,
00619                0x10,0x0E,0x10,0x0E}, /* source bytes 2,6,10,14 (V), each used for two lanes */
00620     demux_y = {0x10,0x01,0x10,0x03,
00621                0x10,0x05,0x10,0x07,
00622                0x10,0x09,0x10,0x0B,
00623                0x10,0x0D,0x10,0x0F}; /* source bytes 1,3,...,15 (Y), one per lane */
00624 /* altivec_uyvy_rgb32: convert srcSliceH rows of packed UYVY read from in[0] into 4-byte pixels written from oplanes[0]+srcSliceY*outstrides[0], 16 pixels per inner iteration; returns srcSliceH. */
00625 /*
00626   this is so I can play live CCIR raw video
00627 */
00628 static int altivec_uyvy_rgb32 (SwsContext *c,
00629                                unsigned char **in, int *instrides,
00630                                int srcSliceY,        int srcSliceH,
00631                                unsigned char **oplanes, int *outstrides)
00632 {
00633     int w = c->srcW;
00634     int h = srcSliceH;
00635     int i,j;
00636     vector unsigned char uyvy;
00637     vector signed   short Y,U,V;
00638     vector signed   short R0,G0,B0,R1,G1,B1;
00639     vector unsigned char  R,G,B;
00640     vector unsigned char *out;
00641     ubyte *img;
00642 
00643     img = in[0];
00644     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
00645     /* NOTE(review): img and out only advance inside the inner loop; instrides[0] and outstrides[0] are never applied per row, so rows are presumably assumed contiguous - confirm */
00646     for (i=0;i<h;i++) {
00647         for (j=0;j<w/16;j++) {
00648             uyvy = vec_ld (0, img); /* first 16 bytes = 8 pixels; NOTE(review): vec_ld presumably needs img 16-byte aligned - confirm */
00649             U = (vector signed short)
00650                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00651 
00652             V = (vector signed short)
00653                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00654 
00655             Y = (vector signed short)
00656                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00657 
00658             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
00659 
00660             uyvy = vec_ld (16, img); /* next 16 bytes = 8 more pixels */
00661             U = (vector signed short)
00662                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
00663 
00664             V = (vector signed short)
00665                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
00666 
00667             Y = (vector signed short)
00668                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
00669 
00670             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
00671             /* clamp at zero and pack both 8-pixel halves into byte vectors */
00672             R  = vec_packclp (R0,R1);
00673             G  = vec_packclp (G0,G1);
00674             B  = vec_packclp (B0,B1);
00675 
00676             //      vec_mstbgr24 (R,G,B, out);
00677             out_rgba (R,G,B,out); /* stores 64 bytes and advances out by 4 vectors */
00678 
00679             img += 32;
00680         }
00681     }
00682     return srcSliceH;
00683 }
00684 
00685 
00686 
00687 /* Ok currently the acceleration routine only supports
00688    inputs of widths a multiple of 16
00689    and heights a multiple 2
00690 
00691    So we just fall back to the C codes for this.
00692 */
00693 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
00694 {
00695     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
00696         return NULL;
00697 
00698     /*
00699       and this seems not to matter too much I tried a bunch of
00700       videos with abnormal widths and MPlayer crashes elsewhere.
00701       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
00702       boom with X11 bad match.
00703 
00704     */
00705     if ((c->srcW & 0xf) != 0)    return NULL;
00706 
00707     switch (c->srcFormat) {
00708     case PIX_FMT_YUV410P:
00709     case PIX_FMT_YUV420P:
00710     /*case IMGFMT_CLPL:        ??? */
00711     case PIX_FMT_GRAY8:
00712     case PIX_FMT_NV12:
00713     case PIX_FMT_NV21:
00714         if ((c->srcH & 0x1) != 0)
00715             return NULL;
00716 
00717         switch(c->dstFormat) {
00718         case PIX_FMT_RGB24:
00719             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
00720             return altivec_yuv2_rgb24;
00721         case PIX_FMT_BGR24:
00722             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
00723             return altivec_yuv2_bgr24;
00724         case PIX_FMT_ARGB:
00725             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
00726             return altivec_yuv2_argb;
00727         case PIX_FMT_ABGR:
00728             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
00729             return altivec_yuv2_abgr;
00730         case PIX_FMT_RGBA:
00731             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
00732             return altivec_yuv2_rgba;
00733         case PIX_FMT_BGRA:
00734             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
00735             return altivec_yuv2_bgra;
00736         default: return NULL;
00737         }
00738         break;
00739 
00740     case PIX_FMT_UYVY422:
00741         switch(c->dstFormat) {
00742         case PIX_FMT_BGR32:
00743             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
00744             return altivec_uyvy_rgb32;
00745         default: return NULL;
00746         }
00747         break;
00748 
00749     }
00750     return NULL;
00751 }
00752 
00753 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
00754 {
00755     union {
00756         DECLARE_ALIGNED(16, signed short, tmp)[8];
00757         vector signed short vec;
00758     } buf;
00759 
00760     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
00761     buf.tmp[1] =  -256*brightness;                                      //oy
00762     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
00763     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
00764     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
00765     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
00766 
00767 
00768     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
00769     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
00770     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
00771     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
00772     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
00773     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
00774     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
00775     return;
00776 }
00777 
00778 
00779 void
00780 ff_yuv2packedX_altivec(SwsContext *c,
00781                        const int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00782                        const int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00783                      uint8_t *dest, int dstW, int dstY)
00784 {
00785     int i,j;
00786     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
00787     vector signed short R0,G0,B0,R1,G1,B1;
00788 
00789     vector unsigned char R,G,B;
00790     vector unsigned char *out,*nout;
00791 
00792     vector signed short   RND = vec_splat_s16(1<<3);
00793     vector unsigned short SCL = vec_splat_u16(4);
00794     DECLARE_ALIGNED(16, unsigned long, scratch)[16];
00795 
00796     vector signed short *YCoeffs, *CCoeffs;
00797 
00798     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
00799     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
00800 
00801     out = (vector unsigned char *)dest;
00802 
00803     for (i=0; i<dstW; i+=16) {
00804         Y0 = RND;
00805         Y1 = RND;
00806         /* extract 16 coeffs from lumSrc */
00807         for (j=0; j<lumFilterSize; j++) {
00808             X0 = vec_ld (0,  &lumSrc[j][i]);
00809             X1 = vec_ld (16, &lumSrc[j][i]);
00810             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00811             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00812         }
00813 
00814         U = RND;
00815         V = RND;
00816         /* extract 8 coeffs from U,V */
00817         for (j=0; j<chrFilterSize; j++) {
00818             X  = vec_ld (0, &chrSrc[j][i/2]);
00819             U  = vec_mradds (X, CCoeffs[j], U);
00820             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
00821             V  = vec_mradds (X, CCoeffs[j], V);
00822         }
00823 
00824         /* scale and clip signals */
00825         Y0 = vec_sra (Y0, SCL);
00826         Y1 = vec_sra (Y1, SCL);
00827         U  = vec_sra (U,  SCL);
00828         V  = vec_sra (V,  SCL);
00829 
00830         Y0 = vec_clip_s16 (Y0);
00831         Y1 = vec_clip_s16 (Y1);
00832         U  = vec_clip_s16 (U);
00833         V  = vec_clip_s16 (V);
00834 
00835         /* now we have
00836           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00837           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
00838 
00839           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00840           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00841           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00842         */
00843 
00844         U0 = vec_mergeh (U,U);
00845         V0 = vec_mergeh (V,V);
00846 
00847         U1 = vec_mergel (U,U);
00848         V1 = vec_mergel (V,V);
00849 
00850         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00851         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00852 
00853         R  = vec_packclp (R0,R1);
00854         G  = vec_packclp (G0,G1);
00855         B  = vec_packclp (B0,B1);
00856 
00857         switch(c->dstFormat) {
00858         case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
00859         case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
00860         case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
00861         case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
00862         case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
00863         case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
00864         default:
00865             {
00866                 /* If this is reached, the caller should have called yuv2packedXinC
00867                    instead. */
00868                 static int printed_error_message;
00869                 if (!printed_error_message) {
00870                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00871                            sws_format_name(c->dstFormat));
00872                     printed_error_message=1;
00873                 }
00874                 return;
00875             }
00876         }
00877     }
00878 
00879     if (i < dstW) {
00880         i -= 16;
00881 
00882         Y0 = RND;
00883         Y1 = RND;
00884         /* extract 16 coeffs from lumSrc */
00885         for (j=0; j<lumFilterSize; j++) {
00886             X0 = vec_ld (0,  &lumSrc[j][i]);
00887             X1 = vec_ld (16, &lumSrc[j][i]);
00888             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
00889             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
00890         }
00891 
00892         U = RND;
00893         V = RND;
00894         /* extract 8 coeffs from U,V */
00895         for (j=0; j<chrFilterSize; j++) {
00896             X  = vec_ld (0, &chrSrc[j][i/2]);
00897             U  = vec_mradds (X, CCoeffs[j], U);
00898             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
00899             V  = vec_mradds (X, CCoeffs[j], V);
00900         }
00901 
00902         /* scale and clip signals */
00903         Y0 = vec_sra (Y0, SCL);
00904         Y1 = vec_sra (Y1, SCL);
00905         U  = vec_sra (U,  SCL);
00906         V  = vec_sra (V,  SCL);
00907 
00908         Y0 = vec_clip_s16 (Y0);
00909         Y1 = vec_clip_s16 (Y1);
00910         U  = vec_clip_s16 (U);
00911         V  = vec_clip_s16 (V);
00912 
00913         /* now we have
00914            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
00915            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
00916 
00917            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
00918            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
00919            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
00920         */
00921 
00922         U0 = vec_mergeh (U,U);
00923         V0 = vec_mergeh (V,V);
00924 
00925         U1 = vec_mergel (U,U);
00926         V1 = vec_mergel (V,V);
00927 
00928         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
00929         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
00930 
00931         R  = vec_packclp (R0,R1);
00932         G  = vec_packclp (G0,G1);
00933         B  = vec_packclp (B0,B1);
00934 
00935         nout = (vector unsigned char *)scratch;
00936         switch(c->dstFormat) {
00937         case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
00938         case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
00939         case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
00940         case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
00941         case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
00942         case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
00943         default:
00944             /* Unreachable, I think. */
00945             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
00946                    sws_format_name(c->dstFormat));
00947             return;
00948         }
00949 
00950         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
00951     }
00952 
00953 }

Generated on Fri Sep 16 2011 17:17:52 for FFmpeg by  doxygen 1.7.1