ffmpeg / postproc / rgb2rgb_template.c @ d8dad2a5
History  View  Annotate  Download (47.4 KB)
1 
/*


2 
*

3 
* rgb2rgb.c, Software RGB to RGB convertor

4 
* pluralize by Software PAL8 to RGB convertor

5 
* Software YUV to YUV convertor

6 
* Software YUV to RGB convertor

7 
* Written by Nick Kurshev.

8 
* palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)

9 
*/

10  
11 
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
#warning You have misconfigured system and probably will lose performance!
#endif

/* Reset all CPU-dispatch macros before redefining them for this template
   instantiation (this file is compiled several times with different flags). */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

/* Width in bytes of the SIMD registers used by this build. */
#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch / packed-average mnemonics per CPU family; "/nop" disables them. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Non-temporal stores and the fence that must follow them need MMX2. */
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif

59  
60 
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size) 
61 
{ 
62 
uint8_t *dest = dst; 
63 
const uint8_t *s = src;

64 
const uint8_t *end;

65 
#ifdef HAVE_MMX

66 
const uint8_t *mm_end;

67 
#endif

68 
end = s + src_size; 
69 
#ifdef HAVE_MMX

70 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
71 
mm_end = end  23;

72 
__asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory"); 
73 
while(s < mm_end)

74 
{ 
75 
__asm __volatile( 
76 
PREFETCH" 32%1\n\t"

77 
"movd %1, %%mm0\n\t"

78 
"punpckldq 3%1, %%mm0\n\t"

79 
"movd 6%1, %%mm1\n\t"

80 
"punpckldq 9%1, %%mm1\n\t"

81 
"movd 12%1, %%mm2\n\t"

82 
"punpckldq 15%1, %%mm2\n\t"

83 
"movd 18%1, %%mm3\n\t"

84 
"punpckldq 21%1, %%mm3\n\t"

85 
"pand %%mm7, %%mm0\n\t"

86 
"pand %%mm7, %%mm1\n\t"

87 
"pand %%mm7, %%mm2\n\t"

88 
"pand %%mm7, %%mm3\n\t"

89 
MOVNTQ" %%mm0, %0\n\t"

90 
MOVNTQ" %%mm1, 8%0\n\t"

91 
MOVNTQ" %%mm2, 16%0\n\t"

92 
MOVNTQ" %%mm3, 24%0"

93 
:"=m"(*dest)

94 
:"m"(*s)

95 
:"memory");

96 
dest += 32;

97 
s += 24;

98 
} 
99 
__asm __volatile(SFENCE:::"memory");

100 
__asm __volatile(EMMS:::"memory");

101 
#endif

102 
while(s < end)

103 
{ 
104 
*dest++ = *s++; 
105 
*dest++ = *s++; 
106 
*dest++ = *s++; 
107 
*dest++ = 0;

108 
} 
109 
} 
110  
111 
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size) 
112 
{ 
113 
uint8_t *dest = dst; 
114 
const uint8_t *s = src;

115 
const uint8_t *end;

116 
#ifdef HAVE_MMX

117 
const uint8_t *mm_end;

118 
#endif

119 
end = s + src_size; 
120 
#ifdef HAVE_MMX

121 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
122 
mm_end = end  31;

123 
while(s < mm_end)

124 
{ 
125 
__asm __volatile( 
126 
PREFETCH" 32%1\n\t"

127 
"movq %1, %%mm0\n\t"

128 
"movq 8%1, %%mm1\n\t"

129 
"movq 16%1, %%mm4\n\t"

130 
"movq 24%1, %%mm5\n\t"

131 
"movq %%mm0, %%mm2\n\t"

132 
"movq %%mm1, %%mm3\n\t"

133 
"movq %%mm4, %%mm6\n\t"

134 
"movq %%mm5, %%mm7\n\t"

135 
"psrlq $8, %%mm2\n\t"

136 
"psrlq $8, %%mm3\n\t"

137 
"psrlq $8, %%mm6\n\t"

138 
"psrlq $8, %%mm7\n\t"

139 
"pand %2, %%mm0\n\t"

140 
"pand %2, %%mm1\n\t"

141 
"pand %2, %%mm4\n\t"

142 
"pand %2, %%mm5\n\t"

143 
"pand %3, %%mm2\n\t"

144 
"pand %3, %%mm3\n\t"

145 
"pand %3, %%mm6\n\t"

146 
"pand %3, %%mm7\n\t"

147 
"por %%mm2, %%mm0\n\t"

148 
"por %%mm3, %%mm1\n\t"

149 
"por %%mm6, %%mm4\n\t"

150 
"por %%mm7, %%mm5\n\t"

151  
152 
"movq %%mm1, %%mm2\n\t"

153 
"movq %%mm4, %%mm3\n\t"

154 
"psllq $48, %%mm2\n\t"

155 
"psllq $32, %%mm3\n\t"

156 
"pand %4, %%mm2\n\t"

157 
"pand %5, %%mm3\n\t"

158 
"por %%mm2, %%mm0\n\t"

159 
"psrlq $16, %%mm1\n\t"

160 
"psrlq $32, %%mm4\n\t"

161 
"psllq $16, %%mm5\n\t"

162 
"por %%mm3, %%mm1\n\t"

163 
"pand %6, %%mm5\n\t"

164 
"por %%mm5, %%mm4\n\t"

165  
166 
MOVNTQ" %%mm0, %0\n\t"

167 
MOVNTQ" %%mm1, 8%0\n\t"

168 
MOVNTQ" %%mm4, 16%0"

169 
:"=m"(*dest)

170 
:"m"(*s),"m"(mask24l), 
171 
"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 
172 
:"memory");

173 
dest += 24;

174 
s += 32;

175 
} 
176 
__asm __volatile(SFENCE:::"memory");

177 
__asm __volatile(EMMS:::"memory");

178 
#endif

179 
while(s < end)

180 
{ 
181 
*dest++ = *s++; 
182 
*dest++ = *s++; 
183 
*dest++ = *s++; 
184 
s++; 
185 
} 
186 
} 
187  
188 
/*

189 
Original by Strepto/Astral

190 
ported to gcc & bugfixed : A'rpi

191 
MMX2, 3DNOW optimization by Nick Kurshev

192 
32bit c version, and and&add trick by Michael Niedermayer

193 
*/

194 
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size) 
195 
{ 
196 
register const uint8_t* s=src; 
197 
register uint8_t* d=dst;

198 
register const uint8_t *end; 
199 
const uint8_t *mm_end;

200 
end = s + src_size; 
201 
#ifdef HAVE_MMX

202 
__asm __volatile(PREFETCH" %0"::"m"(*s)); 
203 
__asm __volatile("movq %0, %%mm4"::"m"(mask15s)); 
204 
mm_end = end  15;

205 
while(s<mm_end)

206 
{ 
207 
__asm __volatile( 
208 
PREFETCH" 32%1\n\t"

209 
"movq %1, %%mm0\n\t"

210 
"movq 8%1, %%mm2\n\t"

211 
"movq %%mm0, %%mm1\n\t"

212 
"movq %%mm2, %%mm3\n\t"

213 
"pand %%mm4, %%mm0\n\t"

214 
"pand %%mm4, %%mm2\n\t"

215 
"paddw %%mm1, %%mm0\n\t"

216 
"paddw %%mm3, %%mm2\n\t"

217 
MOVNTQ" %%mm0, %0\n\t"

218 
MOVNTQ" %%mm2, 8%0"

219 
:"=m"(*d)

220 
:"m"(*s)

221 
); 
222 
d+=16;

223 
s+=16;

224 
} 
225 
__asm __volatile(SFENCE:::"memory");

226 
__asm __volatile(EMMS:::"memory");

227 
#endif

228 
mm_end = end  3;

229 
while(s < mm_end)

230 
{ 
231 
register unsigned x= *((uint32_t *)s); 
232 
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); 
233 
d+=4;

234 
s+=4;

235 
} 
236 
if(s < end)

237 
{ 
238 
register unsigned short x= *((uint16_t *)s); 
239 
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); 
240 
} 
241 
} 
242  
243 
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
244 
{ 
245 
unsigned j,i,num_pixels=src_size/3; 
246 
for(i=0,j=0; j<num_pixels; i+=3,j+=3) 
247 
{ 
248 
dst[j+0] = src[i+2]; 
249 
dst[j+1] = src[i+1]; 
250 
dst[j+2] = src[i+0]; 
251 
} 
252 
} 
253  
254 
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
255 
{ 
256 
const uint8_t *s = src;

257 
const uint8_t *end;

258 
#ifdef HAVE_MMX

259 
const uint8_t *mm_end;

260 
#endif

261 
uint16_t *d = (uint16_t *)dst; 
262 
end = s + src_size; 
263 
#ifdef HAVE_MMX

264 
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 
265 
__asm __volatile( 
266 
"movq %0, %%mm7\n\t"

267 
"movq %1, %%mm6\n\t"

268 
::"m"(red_16mask),"m"(green_16mask)); 
269 
mm_end = end  15;

270 
while(s < mm_end)

271 
{ 
272 
__asm __volatile( 
273 
PREFETCH" 32%1\n\t"

274 
"movd %1, %%mm0\n\t"

275 
"movd 4%1, %%mm3\n\t"

276 
"punpckldq 8%1, %%mm0\n\t"

277 
"punpckldq 12%1, %%mm3\n\t"

278 
"movq %%mm0, %%mm1\n\t"

279 
"movq %%mm0, %%mm2\n\t"

280 
"movq %%mm3, %%mm4\n\t"

281 
"movq %%mm3, %%mm5\n\t"

282 
"psrlq $3, %%mm0\n\t"

283 
"psrlq $3, %%mm3\n\t"

284 
"pand %2, %%mm0\n\t"

285 
"pand %2, %%mm3\n\t"

286 
"psrlq $5, %%mm1\n\t"

287 
"psrlq $5, %%mm4\n\t"

288 
"pand %%mm6, %%mm1\n\t"

289 
"pand %%mm6, %%mm4\n\t"

290 
"psrlq $8, %%mm2\n\t"

291 
"psrlq $8, %%mm5\n\t"

292 
"pand %%mm7, %%mm2\n\t"

293 
"pand %%mm7, %%mm5\n\t"

294 
"por %%mm1, %%mm0\n\t"

295 
"por %%mm4, %%mm3\n\t"

296 
"por %%mm2, %%mm0\n\t"

297 
"por %%mm5, %%mm3\n\t"

298 
"psllq $16, %%mm3\n\t"

299 
"por %%mm3, %%mm0\n\t"

300 
MOVNTQ" %%mm0, %0\n\t"

301 
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 
302 
d += 4;

303 
s += 16;

304 
} 
305 
__asm __volatile(SFENCE:::"memory");

306 
__asm __volatile(EMMS:::"memory");

307 
#endif

308 
while(s < end)

309 
{ 
310 
const int b= *s++; 
311 
const int g= *s++; 
312 
const int r= *s++; 
313 
*d++ = (b>>3)  ((g&0xFC)<<3)  ((r&0xF8)<<8); 
314 
s++; 
315 
} 
316 
} 
317  
318 
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
319 
{ 
320 
const uint8_t *s = src;

321 
const uint8_t *end;

322 
#ifdef HAVE_MMX

323 
const uint8_t *mm_end;

324 
#endif

325 
uint16_t *d = (uint16_t *)dst; 
326 
end = s + src_size; 
327 
#ifdef HAVE_MMX

328 
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 
329 
__asm __volatile( 
330 
"movq %0, %%mm7\n\t"

331 
"movq %1, %%mm6\n\t"

332 
::"m"(red_15mask),"m"(green_15mask)); 
333 
mm_end = end  15;

334 
while(s < mm_end)

335 
{ 
336 
__asm __volatile( 
337 
PREFETCH" 32%1\n\t"

338 
"movd %1, %%mm0\n\t"

339 
"movd 4%1, %%mm3\n\t"

340 
"punpckldq 8%1, %%mm0\n\t"

341 
"punpckldq 12%1, %%mm3\n\t"

342 
"movq %%mm0, %%mm1\n\t"

343 
"movq %%mm0, %%mm2\n\t"

344 
"movq %%mm3, %%mm4\n\t"

345 
"movq %%mm3, %%mm5\n\t"

346 
"psrlq $3, %%mm0\n\t"

347 
"psrlq $3, %%mm3\n\t"

348 
"pand %2, %%mm0\n\t"

349 
"pand %2, %%mm3\n\t"

350 
"psrlq $6, %%mm1\n\t"

351 
"psrlq $6, %%mm4\n\t"

352 
"pand %%mm6, %%mm1\n\t"

353 
"pand %%mm6, %%mm4\n\t"

354 
"psrlq $9, %%mm2\n\t"

355 
"psrlq $9, %%mm5\n\t"

356 
"pand %%mm7, %%mm2\n\t"

357 
"pand %%mm7, %%mm5\n\t"

358 
"por %%mm1, %%mm0\n\t"

359 
"por %%mm4, %%mm3\n\t"

360 
"por %%mm2, %%mm0\n\t"

361 
"por %%mm5, %%mm3\n\t"

362 
"psllq $16, %%mm3\n\t"

363 
"por %%mm3, %%mm0\n\t"

364 
MOVNTQ" %%mm0, %0\n\t"

365 
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 
366 
d += 4;

367 
s += 16;

368 
} 
369 
__asm __volatile(SFENCE:::"memory");

370 
__asm __volatile(EMMS:::"memory");

371 
#endif

372 
while(s < end)

373 
{ 
374 
const int b= *s++; 
375 
const int g= *s++; 
376 
const int r= *s++; 
377 
*d++ = (b>>3)  ((g&0xF8)<<2)  ((r&0xF8)<<7); 
378 
s++; 
379 
} 
380 
} 
381  
382 
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
383 
{ 
384 
const uint8_t *s = src;

385 
const uint8_t *end;

386 
#ifdef HAVE_MMX

387 
const uint8_t *mm_end;

388 
#endif

389 
uint16_t *d = (uint16_t *)dst; 
390 
end = s + src_size; 
391 
#ifdef HAVE_MMX

392 
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 
393 
__asm __volatile( 
394 
"movq %0, %%mm7\n\t"

395 
"movq %1, %%mm6\n\t"

396 
::"m"(red_16mask),"m"(green_16mask)); 
397 
mm_end = end  11;

398 
while(s < mm_end)

399 
{ 
400 
__asm __volatile( 
401 
PREFETCH" 32%1\n\t"

402 
"movd %1, %%mm0\n\t"

403 
"movd 3%1, %%mm3\n\t"

404 
"punpckldq 6%1, %%mm0\n\t"

405 
"punpckldq 9%1, %%mm3\n\t"

406 
"movq %%mm0, %%mm1\n\t"

407 
"movq %%mm0, %%mm2\n\t"

408 
"movq %%mm3, %%mm4\n\t"

409 
"movq %%mm3, %%mm5\n\t"

410 
"psrlq $3, %%mm0\n\t"

411 
"psrlq $3, %%mm3\n\t"

412 
"pand %2, %%mm0\n\t"

413 
"pand %2, %%mm3\n\t"

414 
"psrlq $5, %%mm1\n\t"

415 
"psrlq $5, %%mm4\n\t"

416 
"pand %%mm6, %%mm1\n\t"

417 
"pand %%mm6, %%mm4\n\t"

418 
"psrlq $8, %%mm2\n\t"

419 
"psrlq $8, %%mm5\n\t"

420 
"pand %%mm7, %%mm2\n\t"

421 
"pand %%mm7, %%mm5\n\t"

422 
"por %%mm1, %%mm0\n\t"

423 
"por %%mm4, %%mm3\n\t"

424 
"por %%mm2, %%mm0\n\t"

425 
"por %%mm5, %%mm3\n\t"

426 
"psllq $16, %%mm3\n\t"

427 
"por %%mm3, %%mm0\n\t"

428 
MOVNTQ" %%mm0, %0\n\t"

429 
:"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 
430 
d += 4;

431 
s += 12;

432 
} 
433 
__asm __volatile(SFENCE:::"memory");

434 
__asm __volatile(EMMS:::"memory");

435 
#endif

436 
while(s < end)

437 
{ 
438 
const int b= *s++; 
439 
const int g= *s++; 
440 
const int r= *s++; 
441 
*d++ = (b>>3)  ((g&0xFC)<<3)  ((r&0xF8)<<8); 
442 
} 
443 
} 
444  
445 
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
446 
{ 
447 
const uint8_t *s = src;

448 
const uint8_t *end;

449 
#ifdef HAVE_MMX

450 
const uint8_t *mm_end;

451 
#endif

452 
uint16_t *d = (uint16_t *)dst; 
453 
end = s + src_size; 
454 
#ifdef HAVE_MMX

455 
__asm __volatile(PREFETCH" %0"::"m"(*src):"memory"); 
456 
__asm __volatile( 
457 
"movq %0, %%mm7\n\t"

458 
"movq %1, %%mm6\n\t"

459 
::"m"(red_15mask),"m"(green_15mask)); 
460 
mm_end = end  11;

461 
while(s < mm_end)

462 
{ 
463 
__asm __volatile( 
464 
PREFETCH" 32%1\n\t"

465 
"movd %1, %%mm0\n\t"

466 
"movd 3%1, %%mm3\n\t"

467 
"punpckldq 6%1, %%mm0\n\t"

468 
"punpckldq 9%1, %%mm3\n\t"

469 
"movq %%mm0, %%mm1\n\t"

470 
"movq %%mm0, %%mm2\n\t"

471 
"movq %%mm3, %%mm4\n\t"

472 
"movq %%mm3, %%mm5\n\t"

473 
"psrlq $3, %%mm0\n\t"

474 
"psrlq $3, %%mm3\n\t"

475 
"pand %2, %%mm0\n\t"

476 
"pand %2, %%mm3\n\t"

477 
"psrlq $6, %%mm1\n\t"

478 
"psrlq $6, %%mm4\n\t"

479 
"pand %%mm6, %%mm1\n\t"

480 
"pand %%mm6, %%mm4\n\t"

481 
"psrlq $9, %%mm2\n\t"

482 
"psrlq $9, %%mm5\n\t"

483 
"pand %%mm7, %%mm2\n\t"

484 
"pand %%mm7, %%mm5\n\t"

485 
"por %%mm1, %%mm0\n\t"

486 
"por %%mm4, %%mm3\n\t"

487 
"por %%mm2, %%mm0\n\t"

488 
"por %%mm5, %%mm3\n\t"

489 
"psllq $16, %%mm3\n\t"

490 
"por %%mm3, %%mm0\n\t"

491 
MOVNTQ" %%mm0, %0\n\t"

492 
:"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 
493 
d += 4;

494 
s += 12;

495 
} 
496 
__asm __volatile(SFENCE:::"memory");

497 
__asm __volatile(EMMS:::"memory");

498 
#endif

499 
while(s < end)

500 
{ 
501 
const int b= *s++; 
502 
const int g= *s++; 
503 
const int r= *s++; 
504 
*d++ = (b>>3)  ((g&0xF8)<<2)  ((r&0xF8)<<7); 
505 
} 
506 
} 
507  
508 
/*
  I use here a less accurate approximation by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      Leftmost Bits Repeated to Fill Open Bits
       |
   Original Bits
*/

531 
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
532 
{ 
533 
const uint16_t *end;

534 
#ifdef HAVE_MMX

535 
const uint16_t *mm_end;

536 
#endif

537 
uint8_t *d = (uint8_t *)dst; 
538 
const uint16_t *s = (uint16_t *)src;

539 
end = s + src_size/2;

540 
#ifdef HAVE_MMX

541 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
542 
mm_end = end  7;

543 
while(s < mm_end)

544 
{ 
545 
__asm __volatile( 
546 
PREFETCH" 32%1\n\t"

547 
"movq %1, %%mm0\n\t"

548 
"movq %1, %%mm1\n\t"

549 
"movq %1, %%mm2\n\t"

550 
"pand %2, %%mm0\n\t"

551 
"pand %3, %%mm1\n\t"

552 
"pand %4, %%mm2\n\t"

553 
"psllq $3, %%mm0\n\t"

554 
"psrlq $2, %%mm1\n\t"

555 
"psrlq $7, %%mm2\n\t"

556 
"movq %%mm0, %%mm3\n\t"

557 
"movq %%mm1, %%mm4\n\t"

558 
"movq %%mm2, %%mm5\n\t"

559 
"punpcklwd %5, %%mm0\n\t"

560 
"punpcklwd %5, %%mm1\n\t"

561 
"punpcklwd %5, %%mm2\n\t"

562 
"punpckhwd %5, %%mm3\n\t"

563 
"punpckhwd %5, %%mm4\n\t"

564 
"punpckhwd %5, %%mm5\n\t"

565 
"psllq $8, %%mm1\n\t"

566 
"psllq $16, %%mm2\n\t"

567 
"por %%mm1, %%mm0\n\t"

568 
"por %%mm2, %%mm0\n\t"

569 
"psllq $8, %%mm4\n\t"

570 
"psllq $16, %%mm5\n\t"

571 
"por %%mm4, %%mm3\n\t"

572 
"por %%mm5, %%mm3\n\t"

573  
574 
"movq %%mm0, %%mm6\n\t"

575 
"movq %%mm3, %%mm7\n\t"

576 

577 
"movq 8%1, %%mm0\n\t"

578 
"movq 8%1, %%mm1\n\t"

579 
"movq 8%1, %%mm2\n\t"

580 
"pand %2, %%mm0\n\t"

581 
"pand %3, %%mm1\n\t"

582 
"pand %4, %%mm2\n\t"

583 
"psllq $3, %%mm0\n\t"

584 
"psrlq $2, %%mm1\n\t"

585 
"psrlq $7, %%mm2\n\t"

586 
"movq %%mm0, %%mm3\n\t"

587 
"movq %%mm1, %%mm4\n\t"

588 
"movq %%mm2, %%mm5\n\t"

589 
"punpcklwd %5, %%mm0\n\t"

590 
"punpcklwd %5, %%mm1\n\t"

591 
"punpcklwd %5, %%mm2\n\t"

592 
"punpckhwd %5, %%mm3\n\t"

593 
"punpckhwd %5, %%mm4\n\t"

594 
"punpckhwd %5, %%mm5\n\t"

595 
"psllq $8, %%mm1\n\t"

596 
"psllq $16, %%mm2\n\t"

597 
"por %%mm1, %%mm0\n\t"

598 
"por %%mm2, %%mm0\n\t"

599 
"psllq $8, %%mm4\n\t"

600 
"psllq $16, %%mm5\n\t"

601 
"por %%mm4, %%mm3\n\t"

602 
"por %%mm5, %%mm3\n\t"

603  
604 
:"=m"(*d)

605 
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) 
606 
:"memory");

607 
/* Borrowed 32 to 24 */

608 
__asm __volatile( 
609 
"movq %%mm0, %%mm4\n\t"

610 
"movq %%mm3, %%mm5\n\t"

611 
"movq %%mm6, %%mm0\n\t"

612 
"movq %%mm7, %%mm1\n\t"

613 

614 
"movq %%mm4, %%mm6\n\t"

615 
"movq %%mm5, %%mm7\n\t"

616 
"movq %%mm0, %%mm2\n\t"

617 
"movq %%mm1, %%mm3\n\t"

618  
619 
"psrlq $8, %%mm2\n\t"

620 
"psrlq $8, %%mm3\n\t"

621 
"psrlq $8, %%mm6\n\t"

622 
"psrlq $8, %%mm7\n\t"

623 
"pand %2, %%mm0\n\t"

624 
"pand %2, %%mm1\n\t"

625 
"pand %2, %%mm4\n\t"

626 
"pand %2, %%mm5\n\t"

627 
"pand %3, %%mm2\n\t"

628 
"pand %3, %%mm3\n\t"

629 
"pand %3, %%mm6\n\t"

630 
"pand %3, %%mm7\n\t"

631 
"por %%mm2, %%mm0\n\t"

632 
"por %%mm3, %%mm1\n\t"

633 
"por %%mm6, %%mm4\n\t"

634 
"por %%mm7, %%mm5\n\t"

635  
636 
"movq %%mm1, %%mm2\n\t"

637 
"movq %%mm4, %%mm3\n\t"

638 
"psllq $48, %%mm2\n\t"

639 
"psllq $32, %%mm3\n\t"

640 
"pand %4, %%mm2\n\t"

641 
"pand %5, %%mm3\n\t"

642 
"por %%mm2, %%mm0\n\t"

643 
"psrlq $16, %%mm1\n\t"

644 
"psrlq $32, %%mm4\n\t"

645 
"psllq $16, %%mm5\n\t"

646 
"por %%mm3, %%mm1\n\t"

647 
"pand %6, %%mm5\n\t"

648 
"por %%mm5, %%mm4\n\t"

649  
650 
MOVNTQ" %%mm0, %0\n\t"

651 
MOVNTQ" %%mm1, 8%0\n\t"

652 
MOVNTQ" %%mm4, 16%0"

653  
654 
:"=m"(*d)

655 
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 
656 
:"memory");

657 
d += 24;

658 
s += 8;

659 
} 
660 
__asm __volatile(SFENCE:::"memory");

661 
__asm __volatile(EMMS:::"memory");

662 
#endif

663 
while(s < end)

664 
{ 
665 
register uint16_t bgr;

666 
bgr = *s++; 
667 
*d++ = (bgr&0x1F)<<3; 
668 
*d++ = (bgr&0x3E0)>>2; 
669 
*d++ = (bgr&0x7C00)>>7; 
670 
} 
671 
} 
672  
673 
static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
674 
{ 
675 
const uint16_t *end;

676 
#ifdef HAVE_MMX

677 
const uint16_t *mm_end;

678 
#endif

679 
uint8_t *d = (uint8_t *)dst; 
680 
const uint16_t *s = (const uint16_t *)src; 
681 
end = s + src_size/2;

682 
#ifdef HAVE_MMX

683 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
684 
mm_end = end  7;

685 
while(s < mm_end)

686 
{ 
687 
__asm __volatile( 
688 
PREFETCH" 32%1\n\t"

689 
"movq %1, %%mm0\n\t"

690 
"movq %1, %%mm1\n\t"

691 
"movq %1, %%mm2\n\t"

692 
"pand %2, %%mm0\n\t"

693 
"pand %3, %%mm1\n\t"

694 
"pand %4, %%mm2\n\t"

695 
"psllq $3, %%mm0\n\t"

696 
"psrlq $3, %%mm1\n\t"

697 
"psrlq $8, %%mm2\n\t"

698 
"movq %%mm0, %%mm3\n\t"

699 
"movq %%mm1, %%mm4\n\t"

700 
"movq %%mm2, %%mm5\n\t"

701 
"punpcklwd %5, %%mm0\n\t"

702 
"punpcklwd %5, %%mm1\n\t"

703 
"punpcklwd %5, %%mm2\n\t"

704 
"punpckhwd %5, %%mm3\n\t"

705 
"punpckhwd %5, %%mm4\n\t"

706 
"punpckhwd %5, %%mm5\n\t"

707 
"psllq $8, %%mm1\n\t"

708 
"psllq $16, %%mm2\n\t"

709 
"por %%mm1, %%mm0\n\t"

710 
"por %%mm2, %%mm0\n\t"

711 
"psllq $8, %%mm4\n\t"

712 
"psllq $16, %%mm5\n\t"

713 
"por %%mm4, %%mm3\n\t"

714 
"por %%mm5, %%mm3\n\t"

715 

716 
"movq %%mm0, %%mm6\n\t"

717 
"movq %%mm3, %%mm7\n\t"

718  
719 
"movq 8%1, %%mm0\n\t"

720 
"movq 8%1, %%mm1\n\t"

721 
"movq 8%1, %%mm2\n\t"

722 
"pand %2, %%mm0\n\t"

723 
"pand %3, %%mm1\n\t"

724 
"pand %4, %%mm2\n\t"

725 
"psllq $3, %%mm0\n\t"

726 
"psrlq $3, %%mm1\n\t"

727 
"psrlq $8, %%mm2\n\t"

728 
"movq %%mm0, %%mm3\n\t"

729 
"movq %%mm1, %%mm4\n\t"

730 
"movq %%mm2, %%mm5\n\t"

731 
"punpcklwd %5, %%mm0\n\t"

732 
"punpcklwd %5, %%mm1\n\t"

733 
"punpcklwd %5, %%mm2\n\t"

734 
"punpckhwd %5, %%mm3\n\t"

735 
"punpckhwd %5, %%mm4\n\t"

736 
"punpckhwd %5, %%mm5\n\t"

737 
"psllq $8, %%mm1\n\t"

738 
"psllq $16, %%mm2\n\t"

739 
"por %%mm1, %%mm0\n\t"

740 
"por %%mm2, %%mm0\n\t"

741 
"psllq $8, %%mm4\n\t"

742 
"psllq $16, %%mm5\n\t"

743 
"por %%mm4, %%mm3\n\t"

744 
"por %%mm5, %%mm3\n\t"

745 
:"=m"(*d)

746 
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) 
747 
:"memory");

748 
/* Borrowed 32 to 24 */

749 
__asm __volatile( 
750 
"movq %%mm0, %%mm4\n\t"

751 
"movq %%mm3, %%mm5\n\t"

752 
"movq %%mm6, %%mm0\n\t"

753 
"movq %%mm7, %%mm1\n\t"

754 

755 
"movq %%mm4, %%mm6\n\t"

756 
"movq %%mm5, %%mm7\n\t"

757 
"movq %%mm0, %%mm2\n\t"

758 
"movq %%mm1, %%mm3\n\t"

759  
760 
"psrlq $8, %%mm2\n\t"

761 
"psrlq $8, %%mm3\n\t"

762 
"psrlq $8, %%mm6\n\t"

763 
"psrlq $8, %%mm7\n\t"

764 
"pand %2, %%mm0\n\t"

765 
"pand %2, %%mm1\n\t"

766 
"pand %2, %%mm4\n\t"

767 
"pand %2, %%mm5\n\t"

768 
"pand %3, %%mm2\n\t"

769 
"pand %3, %%mm3\n\t"

770 
"pand %3, %%mm6\n\t"

771 
"pand %3, %%mm7\n\t"

772 
"por %%mm2, %%mm0\n\t"

773 
"por %%mm3, %%mm1\n\t"

774 
"por %%mm6, %%mm4\n\t"

775 
"por %%mm7, %%mm5\n\t"

776  
777 
"movq %%mm1, %%mm2\n\t"

778 
"movq %%mm4, %%mm3\n\t"

779 
"psllq $48, %%mm2\n\t"

780 
"psllq $32, %%mm3\n\t"

781 
"pand %4, %%mm2\n\t"

782 
"pand %5, %%mm3\n\t"

783 
"por %%mm2, %%mm0\n\t"

784 
"psrlq $16, %%mm1\n\t"

785 
"psrlq $32, %%mm4\n\t"

786 
"psllq $16, %%mm5\n\t"

787 
"por %%mm3, %%mm1\n\t"

788 
"pand %6, %%mm5\n\t"

789 
"por %%mm5, %%mm4\n\t"

790  
791 
MOVNTQ" %%mm0, %0\n\t"

792 
MOVNTQ" %%mm1, 8%0\n\t"

793 
MOVNTQ" %%mm4, 16%0"

794  
795 
:"=m"(*d)

796 
:"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh) 
797 
:"memory");

798 
d += 24;

799 
s += 8;

800 
} 
801 
__asm __volatile(SFENCE:::"memory");

802 
__asm __volatile(EMMS:::"memory");

803 
#endif

804 
while(s < end)

805 
{ 
806 
register uint16_t bgr;

807 
bgr = *s++; 
808 
*d++ = (bgr&0x1F)<<3; 
809 
*d++ = (bgr&0x7E0)>>3; 
810 
*d++ = (bgr&0xF800)>>8; 
811 
} 
812 
} 
813  
814 
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
815 
{ 
816 
const uint16_t *end;

817 
#ifdef HAVE_MMX

818 
const uint16_t *mm_end;

819 
#endif

820 
uint8_t *d = (uint8_t *)dst; 
821 
const uint16_t *s = (const uint16_t *)src; 
822 
end = s + src_size/2;

823 
#ifdef HAVE_MMX

824 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
825 
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); 
826 
mm_end = end  3;

827 
while(s < mm_end)

828 
{ 
829 
__asm __volatile( 
830 
PREFETCH" 32%1\n\t"

831 
"movq %1, %%mm0\n\t"

832 
"movq %1, %%mm1\n\t"

833 
"movq %1, %%mm2\n\t"

834 
"pand %2, %%mm0\n\t"

835 
"pand %3, %%mm1\n\t"

836 
"pand %4, %%mm2\n\t"

837 
"psllq $3, %%mm0\n\t"

838 
"psrlq $2, %%mm1\n\t"

839 
"psrlq $7, %%mm2\n\t"

840 
"movq %%mm0, %%mm3\n\t"

841 
"movq %%mm1, %%mm4\n\t"

842 
"movq %%mm2, %%mm5\n\t"

843 
"punpcklwd %%mm7, %%mm0\n\t"

844 
"punpcklwd %%mm7, %%mm1\n\t"

845 
"punpcklwd %%mm7, %%mm2\n\t"

846 
"punpckhwd %%mm7, %%mm3\n\t"

847 
"punpckhwd %%mm7, %%mm4\n\t"

848 
"punpckhwd %%mm7, %%mm5\n\t"

849 
"psllq $8, %%mm1\n\t"

850 
"psllq $16, %%mm2\n\t"

851 
"por %%mm1, %%mm0\n\t"

852 
"por %%mm2, %%mm0\n\t"

853 
"psllq $8, %%mm4\n\t"

854 
"psllq $16, %%mm5\n\t"

855 
"por %%mm4, %%mm3\n\t"

856 
"por %%mm5, %%mm3\n\t"

857 
MOVNTQ" %%mm0, %0\n\t"

858 
MOVNTQ" %%mm3, 8%0\n\t"

859 
:"=m"(*d)

860 
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) 
861 
:"memory");

862 
d += 16;

863 
s += 4;

864 
} 
865 
__asm __volatile(SFENCE:::"memory");

866 
__asm __volatile(EMMS:::"memory");

867 
#endif

868 
while(s < end)

869 
{ 
870 
register uint16_t bgr;

871 
bgr = *s++; 
872 
*d++ = (bgr&0x1F)<<3; 
873 
*d++ = (bgr&0x3E0)>>2; 
874 
*d++ = (bgr&0x7C00)>>7; 
875 
*d++ = 0;

876 
} 
877 
} 
878  
879 
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size) 
880 
{ 
881 
const uint16_t *end;

882 
#ifdef HAVE_MMX

883 
const uint16_t *mm_end;

884 
#endif

885 
uint8_t *d = (uint8_t *)dst; 
886 
const uint16_t *s = (uint16_t *)src;

887 
end = s + src_size/2;

888 
#ifdef HAVE_MMX

889 
__asm __volatile(PREFETCH" %0"::"m"(*s):"memory"); 
890 
__asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory"); 
891 
mm_end = end  3;

892 
while(s < mm_end)

893 
{ 
894 
__asm __volatile( 
895 
PREFETCH" 32%1\n\t"

896 
"movq %1, %%mm0\n\t"

897 
"movq %1, %%mm1\n\t"

898 
"movq %1, %%mm2\n\t"

899 
"pand %2, %%mm0\n\t"

900 
"pand %3, %%mm1\n\t"

901 
"pand %4, %%mm2\n\t"

902 
"psllq $3, %%mm0\n\t"

903 
"psrlq $3, %%mm1\n\t"

904 
"psrlq $8, %%mm2\n\t"

905 
"movq %%mm0, %%mm3\n\t"

906 
"movq %%mm1, %%mm4\n\t"

907 
"movq %%mm2, %%mm5\n\t"

908 
"punpcklwd %%mm7, %%mm0\n\t"

909 
"punpcklwd %%mm7, %%mm1\n\t"

910 
"punpcklwd %%mm7, %%mm2\n\t"

911 
"punpckhwd %%mm7, %%mm3\n\t"

912 
"punpckhwd %%mm7, %%mm4\n\t"

913 
"punpckhwd %%mm7, %%mm5\n\t"

914 
"psllq $8, %%mm1\n\t"

915 
"psllq $16, %%mm2\n\t"

916 
"por %%mm1, %%mm0\n\t"

917 
"por %%mm2, %%mm0\n\t"

918 
"psllq $8, %%mm4\n\t"

919 
"psllq $16, %%mm5\n\t"

920 
"por %%mm4, %%mm3\n\t"

921 
"por %%mm5, %%mm3\n\t"

922 
MOVNTQ" %%mm0, %0\n\t"

923 
MOVNTQ" %%mm3, 8%0\n\t"

924 
:"=m"(*d)

925 
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) 
926 
:"memory");

927 
d += 16;

928 
s += 4;

929 
} 
930 
__asm __volatile(SFENCE:::"memory");

931 
__asm __volatile(EMMS:::"memory");

932 
#endif

933 
while(s < end)

934 
{ 
935 
register uint16_t bgr;

936 
bgr = *s++; 
937 
*d++ = (bgr&0x1F)<<3; 
938 
*d++ = (bgr&0x7E0)>>3; 
939 
*d++ = (bgr&0xF800)>>8; 
940 
*d++ = 0;

941 
} 
942 
} 
943  
944 
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size) 
945 
{ 
946 
#ifdef HAVE_MMX

947 
/* TODO: unroll this loop */

948 
asm volatile ( 
949 
"xorl %%eax, %%eax \n\t"

950 
".balign 16 \n\t"

951 
"1: \n\t"

952 
PREFETCH" 32(%0, %%eax) \n\t"

953 
"movq (%0, %%eax), %%mm0 \n\t"

954 
"movq %%mm0, %%mm1 \n\t"

955 
"movq %%mm0, %%mm2 \n\t"

956 
"pslld $16, %%mm0 \n\t"

957 
"psrld $16, %%mm1 \n\t"

958 
"pand "MANGLE(mask32r)", %%mm0 \n\t" 
959 
"pand "MANGLE(mask32g)", %%mm2 \n\t" 
960 
"pand "MANGLE(mask32b)", %%mm1 \n\t" 
961 
"por %%mm0, %%mm2 \n\t"

962 
"por %%mm1, %%mm2 \n\t"

963 
MOVNTQ" %%mm2, (%1, %%eax) \n\t"

964 
"addl $8, %%eax \n\t"

965 
"cmpl %2, %%eax \n\t"

966 
" jb 1b \n\t"

967 
:: "r" (src), "r"(dst), "r" (src_size7) 
968 
: "%eax"

969 
); 
970  
971 
__asm __volatile(SFENCE:::"memory");

972 
__asm __volatile(EMMS:::"memory");

973 
#else

974 
unsigned i;

975 
unsigned num_pixels = src_size >> 2; 
976 
for(i=0; i<num_pixels; i++) 
977 
{ 
978 
dst[4*i + 0] = src[4*i + 2]; 
979 
dst[4*i + 1] = src[4*i + 1]; 
980 
dst[4*i + 2] = src[4*i + 0]; 
981 
} 
982 
#endif

983 
} 
984  
985 
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size) 
986 
{ 
987 
unsigned i;

988 
#ifdef HAVE_MMX

989 
int mmx_size= 23  src_size; 
990 
asm volatile ( 
991 
"movq "MANGLE(mask24r)", %%mm5 \n\t" 
992 
"movq "MANGLE(mask24g)", %%mm6 \n\t" 
993 
"movq "MANGLE(mask24b)", %%mm7 \n\t" 
994 
".balign 16 \n\t"

995 
"1: \n\t"

996 
PREFETCH" 32(%1, %%eax) \n\t"

997 
"movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG 
998 
"movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG 
999 
"movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B 
1000 
"psllq $16, %%mm0 \n\t" // 00 BGR BGR 
1001 
"pand %%mm5, %%mm0 \n\t"

1002 
"pand %%mm6, %%mm1 \n\t"

1003 
"pand %%mm7, %%mm2 \n\t"

1004 
"por %%mm0, %%mm1 \n\t"

1005 
"por %%mm2, %%mm1 \n\t"

1006 
"movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG 
1007 
MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG 
1008 
"movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B 
1009 
"movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR 
1010 
"pand %%mm7, %%mm0 \n\t"

1011 
"pand %%mm5, %%mm1 \n\t"

1012 
"pand %%mm6, %%mm2 \n\t"

1013 
"por %%mm0, %%mm1 \n\t"

1014 
"por %%mm2, %%mm1 \n\t"

1015 
"movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B 
1016 
MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R 
1017 
"movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR 
1018 
"movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG 
1019 
"pand %%mm6, %%mm0 \n\t"

1020 
"pand %%mm7, %%mm1 \n\t"

1021 
"pand %%mm5, %%mm2 \n\t"

1022 
"por %%mm0, %%mm1 \n\t"

1023 
"por %%mm2, %%mm1 \n\t"

1024 
MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"

1025 
"addl $24, %%eax \n\t"

1026 
" js 1b \n\t"

1027 
: "+a" (mmx_size)

1028 
: "r" (srcmmx_size), "r"(dstmmx_size) 
1029 
); 
1030  
1031 
__asm __volatile(SFENCE:::"memory");

1032 
__asm __volatile(EMMS:::"memory");

1033  
1034 
if(mmx_size==23) return; //finihsed, was multiple of 8 
1035  
1036 
src+= src_size; 
1037 
dst+= src_size; 
1038 
src_size= 23mmx_size;

1039 
src= src_size; 
1040 
dst= src_size; 
1041 
#endif

1042 
for(i=0; i<src_size; i+=3) 
1043 
{ 
1044 
register uint8_t x;

1045 
x = src[i + 2];

1046 
dst[i + 1] = src[i + 1]; 
1047 
dst[i + 2] = src[i + 0]; 
1048 
dst[i + 0] = x;

1049 
} 
1050 
} 
1051  
1052 
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, 
1053 
unsigned int width, unsigned int height, 
1054 
unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma) 
1055 
{ 
1056 
unsigned y;

1057 
const unsigned chromWidth= width>>1; 
1058 
for(y=0; y<height; y++) 
1059 
{ 
1060 
#ifdef HAVE_MMX

1061 
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)

1062 
asm volatile( 
1063 
"xorl %%eax, %%eax \n\t"

1064 
".balign 16 \n\t"

1065 
"1: \n\t"

1066 
PREFETCH" 32(%1, %%eax, 2) \n\t"

1067 
PREFETCH" 32(%2, %%eax) \n\t"

1068 
PREFETCH" 32(%3, %%eax) \n\t"

1069 
"movq (%2, %%eax), %%mm0 \n\t" // U(0) 
1070 
"movq %%mm0, %%mm2 \n\t" // U(0) 
1071 
"movq (%3, %%eax), %%mm1 \n\t" // V(0) 
1072 
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) 
1073 
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) 
1074  
1075 
"movq (%1, %%eax,2), %%mm3 \n\t" // Y(0) 
1076 
"movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8) 
1077 
"movq %%mm3, %%mm4 \n\t" // Y(0) 
1078 
"movq %%mm5, %%mm6 \n\t" // Y(8) 
1079 
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) 
1080 
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4) 
1081 
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) 
1082 
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) 
1083  
1084 
MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"

1085 
MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"

1086 
MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"

1087 
MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

1088  
1089 
"addl $8, %%eax \n\t"

1090 
"cmpl %4, %%eax \n\t"

1091 
" jb 1b \n\t"

1092 
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth) 
1093 
: "%eax"

1094 
); 
1095 
#else

1096 
#if __WORDSIZE >= 64 
1097 
int i;

1098 
uint64_t *ldst = (uint64_t *) dst; 
1099 
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;

1100 
for(i = 0; i < chromWidth; i += 2){ 
1101 
uint64_t k, l; 
1102 
k = yc[0] + (uc[0] << 8) + 
1103 
(yc[1] << 16) + (vc[0] << 24); 
1104 
l = yc[2] + (uc[1] << 8) + 
1105 
(yc[3] << 16) + (vc[1] << 24); 
1106 
*ldst++ = k + (l << 32);

1107 
yc += 4;

1108 
uc += 2;

1109 
vc += 2;

1110 
} 
1111  
1112 
#else

1113 
int i, *idst = (int32_t *) dst;

1114 
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;

1115 
for(i = 0; i < chromWidth; i++){ 
1116 
*idst++ = yc[0] + (uc[0] << 8) + 
1117 
(yc[1] << 16) + (vc[0] << 24); 
1118 
yc += 2;

1119 
uc++; 
1120 
vc++; 
1121 
} 
1122 
#endif

1123 
#endif

1124 
if((y&(vertLumPerChroma1))==(vertLumPerChroma1) ) 
1125 
{ 
1126 
usrc += chromStride; 
1127 
vsrc += chromStride; 
1128 
} 
1129 
ysrc += lumStride; 
1130 
dst += dstStride; 
1131 
} 
1132 
#ifdef HAVE_MMX

1133 
asm( EMMS" \n\t" 
1134 
SFENCE" \n\t"

1135 
:::"memory");

1136 
#endif

1137 
} 
1138  
1139 
/**
 * Converts planar YV12 (4:2:0) to packed YUY2 (YUYV).
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Thin wrapper: delegates to yuvPlanartoyuy2 with vertLumPerChroma=2,
 * i.e. each chroma line is reused for two consecutive luma lines
 * (vertical 2x chroma subsampling of 4:2:0).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	//FIXME interpolate chroma
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
1151  
1152 
/**
 * Converts planar YUV 4:2:2 to packed YUY2 (YUYV).
 *
 * width should be a multiple of 16
 *
 * Thin wrapper: delegates to yuvPlanartoyuy2 with vertLumPerChroma=1,
 * i.e. every luma line has its own chroma line (no vertical subsampling).
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
{
	RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
1162  
1163 
/**
 * Converts packed YUY2 (Y0 U0 Y1 V0 ...) to planar YV12.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 *
 * Chroma is taken only from the even lines; the odd lines contribute luma
 * only (no chroma filtering).  The MMX path processes 16 source bytes
 * (8 pixels) per iteration using 32-bit x86 inline asm; a plain C loop is
 * used otherwise.
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;	// one U and one V sample per 2 pixels
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* Even line: split YUYV into Y plane + U plane + V plane.
		   mm7 = 0x00FF00FF... mask selecting the low (Y) bytes. */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq %%mm0, %%mm2		\n\t" // YUYV YUYV(0)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(4)
			"psrlw $8, %%mm0		\n\t" // U0V0 U0V0(0)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(12)
			"movq %%mm1, %%mm3		\n\t" // YUYV YUYV(8)
			"movq %%mm2, %%mm4		\n\t" // YUYV YUYV(12)
			"psrlw $8, %%mm1		\n\t" // U0V0 U0V0(8)
			"psrlw $8, %%mm2		\n\t" // U0V0 U0V0(12)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			/* Second pass over the packed UV words: split into U and V. */
			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* Odd line: luma only.
		   NOTE(review): this block reuses the 0x00FF mask left in mm7 by
		   the previous asm statement — fragile, the compiler is not told
		   mm7 is live across the two statements; verify before touching. */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // YUYV YUYV(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // YUYV YUYV(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // YUYV YUYV(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // YUYV YUYV(12)
			"pand %%mm7, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"pand %%mm7, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"pand %%mm7, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"pand %%mm7, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		/* Even line: demultiplex Y/U/V. */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			udst[i] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+2];
			vdst[i] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* Odd line: luma only, chroma of this line is dropped. */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+0];
			ydst[2*i+1] 	= src[4*i+2];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	/* Leave MMX state and flush the write-combining buffers of MOVNTQ. */
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1287  
1288 
static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, 
1289 
uint8_t *ydst, uint8_t *udst, uint8_t *vdst, 
1290 
unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride) 
1291 
{ 
1292 
/* Y Plane */

1293 
memcpy(ydst, ysrc, width*height); 
1294  
1295 
/* XXX: implement upscaling for U,V */

1296 
} 
1297  
1298 
/**
 * Doubles a plane in both dimensions (srcWidth x srcHeight ->
 * 2*srcWidth x 2*srcHeight) with simple bilinear-style interpolation.
 *
 * The first and last output line pairs just replicate the edge source
 * pixels; interior lines blend horizontally and vertically adjacent
 * source pixels with 3:1 weights (see the C fallback for the reference
 * arithmetic).  The MMX2/3DNow path approximates the same weights with
 * pairs of PAVGB byte averages, 8 source pixels per iteration.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
	int x,y;

	// first line: plain 2x pixel replication
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
	dst+= dstStride;

	for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		/* NOTE(review): extraction garbled this initializer — an
		   alignment mask (e.g. srcWidth&~15) may have been lost here;
		   the loop below consumes 8 source bytes per iteration, so
		   mmxSize must be a multiple of 8 — verify against upstream. */
		const int mmxSize= srcWidth;
		/* eax runs from -mmxSize up to 0 (the " js 1b" loop condition),
		   so all operand base pointers are pre-biased by +mmxSize.
		   Note the reads at offset (mmxSize-1)+eax touch src[-1] on the
		   first iteration; dst[0]/dst[dstStride] are patched afterwards. */
		asm volatile(
			"movl %4, %%eax			\n\t"
			"1:				\n\t"
			"movq (%0, %%eax), %%mm0	\n\t"
			"movq (%1, %%eax), %%mm1	\n\t"
			"movq 1(%0, %%eax), %%mm2	\n\t"
			"movq 1(%1, %%eax), %%mm3	\n\t"
			"movq %%mm0, %%mm4		\n\t"
			"movq %%mm1, %%mm5		\n\t"
			/* Each PAVGB pair below builds an approximate (3*a+b)/4
			   out of two successive (x+y+1)/2 byte averages. */
			PAVGB" %%mm3, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm0		\n\t"
			PAVGB" %%mm4, %%mm3		\n\t"
			PAVGB" %%mm4, %%mm3		\n\t"
			PAVGB" %%mm2, %%mm1		\n\t"
			PAVGB" %%mm2, %%mm1		\n\t"
			PAVGB" %%mm5, %%mm2		\n\t"
			PAVGB" %%mm5, %%mm2		\n\t"
			"movq %%mm3, %%mm4		\n\t"
			"movq %%mm2, %%mm5		\n\t"
			/* Interleave to produce the two doubled output rows. */
			"punpcklbw %%mm1, %%mm3		\n\t"
			"punpckhbw %%mm1, %%mm4		\n\t"
			"punpcklbw %%mm0, %%mm2		\n\t"
			"punpckhbw %%mm0, %%mm5		\n\t"
#if 1
			MOVNTQ" %%mm3, (%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm4, 8(%2, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, (%3, %%eax, 2)	\n\t"
			MOVNTQ" %%mm5, 8(%3, %%eax, 2)	\n\t"
#else
			"movq %%mm3, (%2, %%eax, 2)	\n\t"
			"movq %%mm4, 8(%2, %%eax, 2)	\n\t"
			"movq %%mm2, (%3, %%eax, 2)	\n\t"
			"movq %%mm5, 8(%3, %%eax, 2)	\n\t"
#endif
			"addl $8, %%eax			\n\t"
			" js 1b				\n\t"
			:: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
			   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
			   "g" (-mmxSize)
			: "%eax"
		);
		/* Patch the left edge (the asm read src[-1] there). */
		dst[0]=
		dst[dstStride]= src[0];
#else
		/* Left edge: replicate. */
		dst[0]=
		dst[dstStride]= src[0];

		/* Interior: 3:1 blends of the four surrounding source pixels. */
		for(x=0; x<srcWidth-1; x++){
			dst[2*x		+1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
			dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
			dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
			dst[2*x		+2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
		}
#endif
		/* Right edge: replicate. */
		dst[srcWidth*2 -1]=
		dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

		dst+=dstStride*2;
		src+=srcStride;
	}
	src-=srcStride;	// step back to the last source line

	// last line: plain 2x pixel replication
	for(x=0; x<srcWidth; x++){
		dst[2*x+0]=
		dst[2*x+1]= src[x];
	}
#ifdef HAVE_MMX
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1386  
1387 
/**
 * Converts packed UYVY (U0 Y0 V0 Y1 ...) to planar YV12.
 *
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
 *
 * Mirror image of yuy2toyv12: in UYVY the chroma bytes are the even
 * (low) bytes and luma the odd (high) bytes, so the pand/psrlw roles
 * are swapped relative to the YUY2 routine.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;	// one U and one V sample per 2 pixels
	for(y=0; y<height; y+=2)
	{
#ifdef HAVE_MMX
		/* Even line: split UYVY into Y plane + U plane + V plane.
		   mm7 = 0x00FF00FF... mask selecting the low (chroma) bytes. */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			"pcmpeqw %%mm7, %%mm7		\n\t"
			"psrlw $8, %%mm7		\n\t" // FF,00,FF,00...
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq %%mm0, %%mm2		\n\t" // UYVY UYVY(0)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(4)
			"pand %%mm7, %%mm0		\n\t" // U0V0 U0V0(0)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(4)
			"packuswb %%mm1, %%mm0		\n\t" // UVUV UVUV(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(0)

			MOVNTQ" %%mm2, (%1, %%eax, 2)	\n\t"

			"movq 16(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(12)
			"movq %%mm1, %%mm3		\n\t" // UYVY UYVY(8)
			"movq %%mm2, %%mm4		\n\t" // UYVY UYVY(12)
			"pand %%mm7, %%mm1		\n\t" // U0V0 U0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0V0 U0V0(12)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm4		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm2, %%mm1		\n\t" // UVUV UVUV(8)
			"packuswb %%mm4, %%mm3		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm3, 8(%1, %%eax, 2)	\n\t"

			/* Second pass over the packed UV words: split into U and V. */
			"movq %%mm0, %%mm2		\n\t" // UVUV UVUV(0)
			"movq %%mm1, %%mm3		\n\t" // UVUV UVUV(8)
			"psrlw $8, %%mm0		\n\t" // V0V0 V0V0(0)
			"psrlw $8, %%mm1		\n\t" // V0V0 V0V0(8)
			"pand %%mm7, %%mm2		\n\t" // U0U0 U0U0(0)
			"pand %%mm7, %%mm3		\n\t" // U0U0 U0U0(8)
			"packuswb %%mm1, %%mm0		\n\t" // VVVV VVVV(0)
			"packuswb %%mm3, %%mm2		\n\t" // UUUU UUUU(0)

			MOVNTQ" %%mm0, (%3, %%eax)	\n\t"
			MOVNTQ" %%mm2, (%2, %%eax)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);

		ydst += lumStride;
		src  += srcStride;

		/* Odd line: luma only (high bytes), chroma dropped. */
		asm volatile(
			"xorl %%eax, %%eax		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%eax, 4)	\n\t"
			"movq (%0, %%eax, 4), %%mm0	\n\t" // UYVY UYVY(0)
			"movq 8(%0, %%eax, 4), %%mm1	\n\t" // UYVY UYVY(4)
			"movq 16(%0, %%eax, 4), %%mm2	\n\t" // UYVY UYVY(8)
			"movq 24(%0, %%eax, 4), %%mm3	\n\t" // UYVY UYVY(12)
			"psrlw $8, %%mm0		\n\t" // Y0Y0 Y0Y0(0)
			"psrlw $8, %%mm1		\n\t" // Y0Y0 Y0Y0(4)
			"psrlw $8, %%mm2		\n\t" // Y0Y0 Y0Y0(8)
			"psrlw $8, %%mm3		\n\t" // Y0Y0 Y0Y0(12)
			"packuswb %%mm1, %%mm0		\n\t" // YYYY YYYY(0)
			"packuswb %%mm3, %%mm2		\n\t" // YYYY YYYY(8)

			MOVNTQ" %%mm0, (%1, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 8(%1, %%eax, 2)	\n\t"

			"addl $8, %%eax			\n\t"
			"cmpl %4, %%eax			\n\t"
			" jb 1b				\n\t"

			::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
			: "memory", "%eax"
		);
#else
		unsigned i;
		/* Even line: demultiplex U/Y/V. */
		for(i=0; i<chromWidth; i++)
		{
			udst[i] 	= src[4*i+0];
			ydst[2*i+0] 	= src[4*i+1];
			vdst[i] 	= src[4*i+2];
			ydst[2*i+1] 	= src[4*i+3];
		}
		ydst += lumStride;
		src  += srcStride;

		/* Odd line: luma only. */
		for(i=0; i<chromWidth; i++)
		{
			ydst[2*i+0] 	= src[4*i+1];
			ydst[2*i+1] 	= src[4*i+3];
		}
#endif
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
#ifdef HAVE_MMX
	/* Leave MMX state and flush MOVNTQ write-combining buffers. */
	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#endif
}
1512  
1513 
/**
 * Converts 24-bit RGB (bytes in memory order B,G,R — see the b/g/r loads
 * in the C fallback) to planar YV12.
 *
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and ill fix it)
 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
 *
 * MMX path: two luma passes (one per source row) followed by one chroma
 * pass that averages a 2x2 pixel block per U/V sample.  Uses the
 * bgr2YCoeff/bgr2UCoeff/bgr2VCoeff/w1111 constant tables via MANGLE.
 * NOTE(review): the asm uses %ebx as a scratch register, which conflicts
 * with PIC builds where ebx holds the GOT pointer.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
	unsigned int width, unsigned int height,
	unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
{
	unsigned y;
	const unsigned chromWidth= width>>1;	// one U and one V sample per 2x2 block
#ifdef HAVE_MMX
	/* All rows but the last two are done in MMX; the C loop below
	   finishes the remainder (y keeps its value across the #endif). */
	for(y=0; y<height-2; y+=2)
	{
		unsigned i;
		/* Luma: two source rows, 8 Y values (24 src bytes) per iteration.
		   eax counts from -width up to 0; ebx = 3*eax indexes the
		   3-byte-per-pixel source, both base pointers are pre-biased. */
		for(i=0; i<2; i++)
		{
			asm volatile(
				"movl %2, %%eax			\n\t"
				"movq "MANGLE(bgr2YCoeff)", %%mm6	\n\t"
				"movq "MANGLE(w1111)", %%mm5	\n\t"
				"pxor %%mm7, %%mm7		\n\t"
				"leal (%%eax, %%eax, 2), %%ebx	\n\t"
				".balign 16			\n\t"
				"1:				\n\t"
				PREFETCH" 64(%0, %%ebx)		\n\t"
				/* Pixels 0..3: widen to words, dot with Y coeffs. */
				"movd (%0, %%ebx), %%mm0	\n\t"
				"movd 3(%0, %%ebx), %%mm1	\n\t"
				"punpcklbw %%mm7, %%mm0		\n\t"
				"punpcklbw %%mm7, %%mm1		\n\t"
				"movd 6(%0, %%ebx), %%mm2	\n\t"
				"movd 9(%0, %%ebx), %%mm3	\n\t"
				"punpcklbw %%mm7, %%mm2		\n\t"
				"punpcklbw %%mm7, %%mm3		\n\t"
				"pmaddwd %%mm6, %%mm0		\n\t"
				"pmaddwd %%mm6, %%mm1		\n\t"
				"pmaddwd %%mm6, %%mm2		\n\t"
				"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
				"psrad $8, %%mm0		\n\t"
				"psrad $8, %%mm1		\n\t"
				"psrad $8, %%mm2		\n\t"
				"psrad $8, %%mm3		\n\t"
#endif
				/* Horizontal add of the partial products via w1111. */
				"packssdw %%mm1, %%mm0		\n\t"
				"packssdw %%mm3, %%mm2		\n\t"
				"pmaddwd %%mm5, %%mm0		\n\t"
				"pmaddwd %%mm5, %%mm2		\n\t"
				"packssdw %%mm2, %%mm0		\n\t"
				"psraw $7, %%mm0		\n\t"

				/* Pixels 4..7, same computation. */
				"movd 12(%0, %%ebx), %%mm4	\n\t"
				"movd 15(%0, %%ebx), %%mm1	\n\t"
				"punpcklbw %%mm7, %%mm4		\n\t"
				"punpcklbw %%mm7, %%mm1		\n\t"
				"movd 18(%0, %%ebx), %%mm2	\n\t"
				"movd 21(%0, %%ebx), %%mm3	\n\t"
				"punpcklbw %%mm7, %%mm2		\n\t"
				"punpcklbw %%mm7, %%mm3		\n\t"
				"pmaddwd %%mm6, %%mm4		\n\t"
				"pmaddwd %%mm6, %%mm1		\n\t"
				"pmaddwd %%mm6, %%mm2		\n\t"
				"pmaddwd %%mm6, %%mm3		\n\t"
#ifndef FAST_BGR2YV12
				"psrad $8, %%mm4		\n\t"
				"psrad $8, %%mm1		\n\t"
				"psrad $8, %%mm2		\n\t"
				"psrad $8, %%mm3		\n\t"
#endif
				"packssdw %%mm1, %%mm4		\n\t"
				"packssdw %%mm3, %%mm2		\n\t"
				"pmaddwd %%mm5, %%mm4		\n\t"
				"pmaddwd %%mm5, %%mm2		\n\t"
				"addl $24, %%ebx		\n\t"
				"packssdw %%mm2, %%mm4		\n\t"
				"psraw $7, %%mm4		\n\t"

				"packuswb %%mm4, %%mm0		\n\t"
				"paddusb "MANGLE(bgr2YOffset)", %%mm0	\n\t"

				MOVNTQ" %%mm0, (%1, %%eax)	\n\t"
				"addl $8, %%eax			\n\t"
				" js 1b				\n\t"
				: : "r" (src+width*3), "r" (ydst+width), "g" (-width)
				: "%eax", "%ebx"
			);
			ydst += lumStride;
			src  += srcStride;
		}
		src -= srcStride*2;	// rewind: chroma pass reads both rows again
		/* Chroma: average a 2x2 block (rows %0 and %1), then dot with
		   the U (mm6) and V (bgr2VCoeff) coefficient vectors.
		   eax counts from -width up to 0, 4 U + 4 V outputs and
		   24 source bytes per iteration; ebx = 6*eax.
		   NOTE(review): the bound/base here use width where chromWidth
		   looks intended (only chromWidth chroma samples exist per
		   line); later versions of this file use chromWidth — verify. */
		asm volatile(
			"movl %4, %%eax			\n\t"
			"movq "MANGLE(w1111)", %%mm5	\n\t"
			"movq "MANGLE(bgr2UCoeff)", %%mm6	\n\t"
			"pxor %%mm7, %%mm7		\n\t"
			"leal (%%eax, %%eax, 2), %%ebx	\n\t"
			"addl %%ebx, %%ebx		\n\t"
			".balign 16			\n\t"
			"1:				\n\t"
			PREFETCH" 64(%0, %%ebx)		\n\t"
			PREFETCH" 64(%1, %%ebx)		\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
			/* Fast 2x2 average using PAVGB: vertical average of the
			   two rows, then horizontal average via a 3-byte shift. */
			"movq (%0, %%ebx), %%mm0	\n\t"
			"movq (%1, %%ebx), %%mm1	\n\t"
			"movq 6(%0, %%ebx), %%mm2	\n\t"
			"movq 6(%1, %%ebx), %%mm3	\n\t"
			PAVGB" %%mm1, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"psrlq $24, %%mm0		\n\t"
			"psrlq $24, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm0		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
#else
			/* Exact 2x2 average: widen, add the four pixels, >>2. */
			"movd (%0, %%ebx), %%mm0	\n\t"
			"movd (%1, %%ebx), %%mm1	\n\t"
			"movd 3(%0, %%ebx), %%mm2	\n\t"
			"movd 3(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm0		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm0		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm2, %%mm0		\n\t"
			"movd 6(%0, %%ebx), %%mm4	\n\t"
			"movd 6(%1, %%ebx), %%mm1	\n\t"
			"movd 9(%0, %%ebx), %%mm2	\n\t"
			"movd 9(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm4		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm4, %%mm2		\n\t"
			"psrlw $2, %%mm0		\n\t"
			"psrlw $2, %%mm2		\n\t"
#endif
			"movq "MANGLE(bgr2VCoeff)", %%mm1	\n\t"
			"movq "MANGLE(bgr2VCoeff)", %%mm3	\n\t"

			"pmaddwd %%mm0, %%mm1		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"pmaddwd %%mm6, %%mm0		\n\t"
			"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
			"psrad $8, %%mm0		\n\t"
			"psrad $8, %%mm1		\n\t"
			"psrad $8, %%mm2		\n\t"
			"psrad $8, %%mm3		\n\t"
#endif
			"packssdw %%mm2, %%mm0		\n\t"
			"packssdw %%mm3, %%mm1		\n\t"
			"pmaddwd %%mm5, %%mm0		\n\t"
			"pmaddwd %%mm5, %%mm1		\n\t"
			"packssdw %%mm1, %%mm0		\n\t" // V1 V0 U1 U0
			"psraw $7, %%mm0		\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
			/* Next two 2x2 blocks, PAVGB variant. */
			"movq 12(%0, %%ebx), %%mm4	\n\t"
			"movq 12(%1, %%ebx), %%mm1	\n\t"
			"movq 18(%0, %%ebx), %%mm2	\n\t"
			"movq 18(%1, %%ebx), %%mm3	\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"movq %%mm4, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"psrlq $24, %%mm4		\n\t"
			"psrlq $24, %%mm2		\n\t"
			PAVGB" %%mm1, %%mm4		\n\t"
			PAVGB" %%mm3, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
#else
			/* Next two 2x2 blocks, exact variant (mm5 is reloaded
			   with w1111 afterwards because it is used as scratch). */
			"movd 12(%0, %%ebx), %%mm4	\n\t"
			"movd 12(%1, %%ebx), %%mm1	\n\t"
			"movd 15(%0, %%ebx), %%mm2	\n\t"
			"movd 15(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm4		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm4		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm2, %%mm4		\n\t"
			"movd 18(%0, %%ebx), %%mm5	\n\t"
			"movd 18(%1, %%ebx), %%mm1	\n\t"
			"movd 21(%0, %%ebx), %%mm2	\n\t"
			"movd 21(%1, %%ebx), %%mm3	\n\t"
			"punpcklbw %%mm7, %%mm5		\n\t"
			"punpcklbw %%mm7, %%mm1		\n\t"
			"punpcklbw %%mm7, %%mm2		\n\t"
			"punpcklbw %%mm7, %%mm3		\n\t"
			"paddw %%mm1, %%mm5		\n\t"
			"paddw %%mm3, %%mm2		\n\t"
			"paddw %%mm5, %%mm2		\n\t"
			"movq "MANGLE(w1111)", %%mm5	\n\t"
			"psrlw $2, %%mm4		\n\t"
			"psrlw $2, %%mm2		\n\t"
#endif
			"movq "MANGLE(bgr2VCoeff)", %%mm1	\n\t"
			"movq "MANGLE(bgr2VCoeff)", %%mm3	\n\t"

			"pmaddwd %%mm4, %%mm1		\n\t"
			"pmaddwd %%mm2, %%mm3		\n\t"
			"pmaddwd %%mm6, %%mm4		\n\t"
			"pmaddwd %%mm6, %%mm2		\n\t"
#ifndef FAST_BGR2YV12
			"psrad $8, %%mm4		\n\t"
			"psrad $8, %%mm1		\n\t"
			"psrad $8, %%mm2		\n\t"
			"psrad $8, %%mm3		\n\t"
#endif
			"packssdw %%mm2, %%mm4		\n\t"
			"packssdw %%mm3, %%mm1		\n\t"
			"pmaddwd %%mm5, %%mm4		\n\t"
			"pmaddwd %%mm5, %%mm1		\n\t"
			"addl $24, %%ebx		\n\t"
			"packssdw %%mm1, %%mm4		\n\t" // V3 V2 U3 U2
			"psraw $7, %%mm4		\n\t"

			/* Regroup into 4 U bytes + 4 V bytes, add the bias. */
			"movq %%mm0, %%mm1		\n\t"
			"punpckldq %%mm4, %%mm0		\n\t"
			"punpckhdq %%mm4, %%mm1		\n\t"
			"packsswb %%mm1, %%mm0		\n\t"
			"paddb "MANGLE(bgr2UVOffset)", %%mm0	\n\t"

			"movd %%mm0, (%2, %%eax)	\n\t"
			"punpckhdq %%mm0, %%mm0		\n\t"
			"movd %%mm0, (%3, %%eax)	\n\t"
			"addl $4, %%eax			\n\t"
			" js 1b				\n\t"
			: : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
			: "%eax", "%ebx"
		);

		udst += chromStride;
		vdst += chromStride;
		src  += srcStride*2;
	}

	asm volatile( EMMS" \n\t"
		SFENCE" \n\t"
		:::"memory");
#else
	y=0;
#endif
	/* C path / remainder: integer RGB->YUV with the RY..BV coefficient
	   macros; chroma from the top-left pixel of each 2x2 only. */
	for(; y<height; y+=2)
	{
		unsigned i;
		for(i=0; i<chromWidth; i++)
		{
			unsigned int b= src[6*i+0];
			unsigned int g= src[6*i+1];
			unsigned int r= src[6*i+2];

			unsigned int Y	=  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			unsigned int V	=  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
			unsigned int U	=  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

			udst[i] 	= U;
			vdst[i] 	= V;
			ydst[2*i] 	= Y;

			b= src[6*i+3];
			g= src[6*i+4];
			r= src[6*i+5];

			Y	=  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			ydst[2*i+1] 	= Y;
		}
		ydst += lumStride;
		src  += srcStride;

		/* Second row of the pair: luma only. */
		for(i=0; i<chromWidth; i++)
		{
			unsigned int b= src[6*i+0];
			unsigned int g= src[6*i+1];
			unsigned int r= src[6*i+2];

			unsigned int Y	=  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

			ydst[2*i] 	= Y;

			b= src[6*i+3];
			g= src[6*i+4];
			r= src[6*i+5];

			Y	=  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
			ydst[2*i+1] 	= Y;
		}
		udst += chromStride;
		vdst += chromStride;
		ydst += lumStride;
		src  += srcStride;
	}
}
1815  
1816 
/**
 * Interleaves two byte planes: dest[2*w] = src1[w], dest[2*w+1] = src2[w]
 * for every row (e.g. merging separate U and V planes into a UV plane).
 *
 * SIMD paths handle width in chunks of 16; a scalar loop finishes the
 * remaining width&15 bytes of each row.
 * NOTE(review): the SSE2 path uses movdqa/movntdq, which require src1,
 * src2, dest and all three strides to be 16-byte aligned — verify callers.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
			unsigned width, unsigned height, unsigned src1Stride,
			unsigned src2Stride, unsigned dstStride){
	unsigned h;

	for(h=0; h < height; h++)
	{
		unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
		asm(
			"xorl %%eax, %%eax		\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%eax)		\n\t"
			PREFETCH" 64(%2, %%eax)		\n\t"
			/* xmm1 re-loads the same 16 bytes instead of copying
			   xmm0 — equivalent result, both get src1 data. */
			"movdqa (%1, %%eax), %%xmm0	\n\t"
			"movdqa (%1, %%eax), %%xmm1	\n\t"
			"movdqa (%2, %%eax), %%xmm2	\n\t"
			"punpcklbw %%xmm2, %%xmm0	\n\t"
			"punpckhbw %%xmm2, %%xmm1	\n\t"
			"movntdq %%xmm0, (%0, %%eax, 2)	\n\t"
			"movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
			"addl $16, %%eax		\n\t"
			"cmpl %3, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#else
		asm(
			"xorl %%eax, %%eax		\n\t"
			"1:				\n\t"
			PREFETCH" 64(%1, %%eax)		\n\t"
			PREFETCH" 64(%2, %%eax)		\n\t"
			"movq (%1, %%eax), %%mm0	\n\t"
			"movq 8(%1, %%eax), %%mm2	\n\t"
			"movq %%mm0, %%mm1		\n\t"
			"movq %%mm2, %%mm3		\n\t"
			"movq (%2, %%eax), %%mm4	\n\t"
			"movq 8(%2, %%eax), %%mm5	\n\t"
			/* Byte-interleave src1 (mm0..3) with src2 (mm4/mm5). */
			"punpcklbw %%mm4, %%mm0		\n\t"
			"punpckhbw %%mm4, %%mm1		\n\t"
			"punpcklbw %%mm5, %%mm2		\n\t"
			"punpckhbw %%mm5, %%mm3		\n\t"
			MOVNTQ" %%mm0, (%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm1, 8(%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm2, 16(%0, %%eax, 2)	\n\t"
			MOVNTQ" %%mm3, 24(%0, %%eax, 2)	\n\t"
			"addl $16, %%eax		\n\t"
			"cmpl %3, %%eax			\n\t"
			" jb 1b				\n\t"
			::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
			: "memory", "%eax"
		);
#endif
		/* Scalar tail for the last width&15 bytes of the row. */
		for(w= (width&(~15)); w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#else
		for(w=0; w < width; w++)
		{
			dest[2*w+0] = src1[w];
			dest[2*w+1] = src2[w];
		}
#endif
		dest += dstStride;
		src1 += src1Stride;
		src2 += src2Stride;
	}
#ifdef HAVE_MMX
	asm(
		EMMS" \n\t"
		SFENCE" \n\t"
		::: "memory"
		);
#endif
}