@@ -85,36 +85,6 @@ PremultiplyVector_NEON(const uint16x8_t& aSrc) {
8585 return vsriq_n_u16 (ga, rb, 8 );
8686}
8787
88- template <bool aSwapRB, bool aOpaqueAlpha>
89- static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON (const uint8_t *& aSrc,
90- uint8_t *& aDst,
91- int32_t aAlignedRow,
92- int32_t aRemainder) {
93- // Process all 4-pixel chunks as one vector.
94- for (const uint8_t * end = aSrc + aAlignedRow; aSrc < end;) {
95- uint16x8_t px = vld1q_u16 (reinterpret_cast <const uint16_t *>(aSrc));
96- px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
97- vst1q_u16 (reinterpret_cast <uint16_t *>(aDst), px);
98- aSrc += 4 * 4 ;
99- aDst += 4 * 4 ;
100- }
101-
102- // Handle any 1-3 remaining pixels.
103- if (aRemainder) {
104- uint16x8_t px = LoadRemainder_NEON (aSrc, aRemainder);
105- px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
106- StoreRemainder_NEON (aDst, aRemainder, px);
107- }
108- }
109-
110- template <bool aSwapRB, bool aOpaqueAlpha>
111- void PremultiplyRow_NEON (const uint8_t * aSrc, uint8_t * aDst, int32_t aLength) {
112- int32_t alignedRow = 4 * (aLength & ~3 );
113- int32_t remainder = aLength & 3 ;
114- PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
115- remainder);
116- }
117-
11888template <bool aSwapRB, bool aOpaqueAlpha>
11989void Premultiply_NEON (const uint8_t * aSrc, int32_t aSrcGap, uint8_t * aDst,
12090 int32_t aDstGap, IntSize aSize) {
@@ -125,22 +95,28 @@ void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
12595 aDstGap += 4 * remainder;
12696
12797 for (int32_t height = aSize.height ; height > 0 ; height--) {
128- PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
129- remainder);
98+ // Process all 4-pixel chunks as one vector.
99+ for (const uint8_t * end = aSrc + alignedRow; aSrc < end;) {
100+ uint16x8_t px = vld1q_u16 (reinterpret_cast <const uint16_t *>(aSrc));
101+ px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
102+ vst1q_u16 (reinterpret_cast <uint16_t *>(aDst), px);
103+ aSrc += 4 * 4 ;
104+ aDst += 4 * 4 ;
105+ }
106+
107+ // Handle any 1-3 remaining pixels.
108+ if (remainder) {
109+ uint16x8_t px = LoadRemainder_NEON (aSrc, remainder);
110+ px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
111+ StoreRemainder_NEON (aDst, remainder, px);
112+ }
113+
130114 aSrc += aSrcGap;
131115 aDst += aDstGap;
132116 }
133117}
134118
135119// Force instantiation of premultiply variants here.
136- template void PremultiplyRow_NEON<false , false >(const uint8_t *, uint8_t *,
137- int32_t );
138- template void PremultiplyRow_NEON<false , true >(const uint8_t *, uint8_t *,
139- int32_t );
140- template void PremultiplyRow_NEON<true , false >(const uint8_t *, uint8_t *,
141- int32_t );
142- template void PremultiplyRow_NEON<true , true >(const uint8_t *, uint8_t *,
143- int32_t );
144120template void Premultiply_NEON<false , false >(const uint8_t *, int32_t , uint8_t *,
145121 int32_t , IntSize);
146122template void Premultiply_NEON<false , true >(const uint8_t *, int32_t , uint8_t *,
@@ -282,7 +258,7 @@ template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
282258
283259// Swizzle a vector of 4 pixels providing swaps and opaquifying.
284260template <bool aSwapRB, bool aOpaqueAlpha>
285- static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON (const uint16x8_t & aSrc) {
261+ MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON (const uint16x8_t & aSrc) {
286262 // Swap R and B, then add to G and A (forced to 255):
287263 // (((src>>16) | (src << 16)) & 0x00FF00FF) |
288264 // ((src | 0xFF000000) & ~0x00FF00FF)
@@ -299,50 +275,21 @@ static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
299275
300276// Optimized implementations for when there is no R and B swap.
301277template<>
302- static MOZ_ALWAYS_INLINE uint16x8_t
278+ MOZ_ALWAYS_INLINE uint16x8_t
303279SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
304280{
305281 // Force alpha to 255.
306282 return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
307283}
308284
309285template<>
310- static MOZ_ALWAYS_INLINE uint16x8_t
286+ MOZ_ALWAYS_INLINE uint16x8_t
311287SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
312288{
313289 return aSrc;
314290}
315291#endif
316292
317- template <bool aSwapRB, bool aOpaqueAlpha>
318- static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON (const uint8_t *& aSrc,
319- uint8_t *& aDst,
320- int32_t aAlignedRow,
321- int32_t aRemainder) {
322- // Process all 4-pixel chunks as one vector.
323- for (const uint8_t * end = aSrc + aAlignedRow; aSrc < end;) {
324- uint16x8_t px = vld1q_u16 (reinterpret_cast <const uint16_t *>(aSrc));
325- px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
326- vst1q_u16 (reinterpret_cast <uint16_t *>(aDst), px);
327- aSrc += 4 * 4 ;
328- aDst += 4 * 4 ;
329- }
330-
331- // Handle any 1-3 remaining pixels.
332- if (aRemainder) {
333- uint16x8_t px = LoadRemainder_NEON (aSrc, aRemainder);
334- px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
335- StoreRemainder_NEON (aDst, aRemainder, px);
336- }
337- }
338-
339- template <bool aSwapRB, bool aOpaqueAlpha>
340- void SwizzleRow_NEON (const uint8_t * aSrc, uint8_t * aDst, int32_t aLength) {
341- int32_t alignedRow = 4 * (aLength & ~3 );
342- int32_t remainder = aLength & 3 ;
343- SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
344- }
345-
346293template <bool aSwapRB, bool aOpaqueAlpha>
347294void Swizzle_NEON (const uint8_t * aSrc, int32_t aSrcGap, uint8_t * aDst,
348295 int32_t aDstGap, IntSize aSize) {
@@ -353,16 +300,28 @@ void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
353300 aDstGap += 4 * remainder;
354301
355302 for (int32_t height = aSize.height ; height > 0 ; height--) {
356- SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
357- remainder);
303+ // Process all 4-pixel chunks as one vector.
304+ for (const uint8_t * end = aSrc + alignedRow; aSrc < end;) {
305+ uint16x8_t px = vld1q_u16 (reinterpret_cast <const uint16_t *>(aSrc));
306+ px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
307+ vst1q_u16 (reinterpret_cast <uint16_t *>(aDst), px);
308+ aSrc += 4 * 4 ;
309+ aDst += 4 * 4 ;
310+ }
311+
312+ // Handle any 1-3 remaining pixels.
313+ if (remainder) {
314+ uint16x8_t px = LoadRemainder_NEON (aSrc, remainder);
315+ px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
316+ StoreRemainder_NEON (aDst, remainder, px);
317+ }
318+
358319 aSrc += aSrcGap;
359320 aDst += aDstGap;
360321 }
361322}
362323
363324// Force instantiation of swizzle variants here.
364- template void SwizzleRow_NEON<true , false >(const uint8_t *, uint8_t *, int32_t );
365- template void SwizzleRow_NEON<true , true >(const uint8_t *, uint8_t *, int32_t );
366325template void Swizzle_NEON<true , false >(const uint8_t *, int32_t , uint8_t *,
367326 int32_t , IntSize);
368327template void Swizzle_NEON<true , true >(const uint8_t *, int32_t , uint8_t *,
0 commit comments