// Returns floor(a / b) for b != 0. C's `/` truncates toward zero, which is
// not the same thing when the operands have opposite signs.
static inline int32_t swr_floorDivi(int32_t a, int32_t b)
{
    const int32_t q = a / b;
    const int32_t r = a % b;
    return (r != 0 && ((r < 0) != (b < 0))) ? q - 1 : q;
}

// Returns ceil(a / b) for b != 0.
static inline int32_t swr_ceilDivi(int32_t a, int32_t b)
{
    const int32_t q = a / b;
    const int32_t r = a % b;
    return (r != 0 && ((r < 0) == (b < 0))) ? q + 1 : q;
}

// Narrows the inclusive span [*ixmin, *ixmax] to the columns ix for which the
// edge function (e - ix * s) is non-negative.
//   e: edge function value at ix == 0
//   s: amount the edge function decreases per step in x
// If no column can satisfy the edge, the span is made empty (*ixmax < *ixmin).
static inline void swr_clipSpanToEdge(int32_t e, int32_t s, int32_t* ixmin, int32_t* ixmax)
{
    if (s > 0) {
        // e - ix * s >= 0  <=>  ix <= e / s
        *ixmax = swr_mini(*ixmax, swr_floorDivi(e, s));
    } else if (s < 0) {
        // e - ix * s >= 0  <=>  ix >= e / s  (dividing by a negative flips the inequality)
        *ixmin = swr_maxi(*ixmin, swr_ceilDivi(e, s));
    } else if (e < 0) {
        // Edge is constant along the row and the whole row is outside of it.
        *ixmax = *ixmin - 1;
    }
}

// Fills the triangle (x0, y0), (x1, y1), (x2, y2) into ctx->m_FrameBuffer,
// interpolating the three vertex colors with barycentric weights.
//
// - Coordinates are integer pixel positions. The triangle is clipped to
//   [0, m_Width - 1] x [0, m_Height - 1]; rows are m_Width pixels apart.
// - Either winding is accepted: if the signed area is negative, vertices 1 and
//   2 (and their colors) are swapped. Zero-area triangles draw nothing.
// - A pixel is written when all three integer edge functions are >= 0, i.e.
//   pixels lying exactly on an edge are included.
// - Each color is treated as four independent 8-bit channels packed in a
//   32-bit value; every channel is interpolated in float and saturated back
//   to 8 bits.
//
// NOTE: All edge-function products are evaluated in int32_t, so very large
// coordinates can overflow.
static void swrDrawTriangle(swr_context* ctx, int32_t x0, int32_t y0, int32_t x1, int32_t y1, int32_t x2, int32_t y2, uint32_t color0, uint32_t color1, uint32_t color2)
{
    int32_t iarea = (x2 - x0) * (y1 - y0) - (x1 - x0) * (y2 - y0);
    if (iarea == 0) {
        // Degenerate triangle with 0 area.
        return;
    }

    if (iarea < 0) {
        // Force a consistent winding: swap (x1, y1, color1) <-> (x2, y2, color2)
        { int32_t tmp = x1; x1 = x2; x2 = tmp; }
        { int32_t tmp = y1; y1 = y2; y2 = tmp; }
        { uint32_t tmp = color1; color1 = color2; color2 = tmp; }
        iarea = -iarea;
    }

    // Bounding box of the triangle, clipped to the framebuffer.
    const int32_t bboxMinX = swr_maxi(swr_min3i(x0, x1, x2), 0);
    const int32_t bboxMinY = swr_maxi(swr_min3i(y0, y1, y2), 0);
    const int32_t bboxMaxX = swr_mini(swr_max3i(x0, x1, x2), (int32_t)ctx->m_Width - 1);
    const int32_t bboxMaxY = swr_mini(swr_max3i(y0, y1, y2), (int32_t)ctx->m_Height - 1);
    if (bboxMinX > bboxMaxX || bboxMinY > bboxMaxY) {
        // Entirely off-screen. Bail out before forming a framebuffer pointer
        // from an out-of-range bboxMinX/bboxMinY.
        return;
    }

    const int32_t bboxWidth = bboxMaxX - bboxMinX;
    const int32_t bboxHeight = bboxMaxY - bboxMinY;

    // Edge deltas (computed after the potential vertex swap above).
    const int32_t dx01 = x0 - x1;
    const int32_t dy01 = y0 - y1;
    const int32_t dx20 = x2 - x0;
    const int32_t dy20 = y2 - y0;
    const int32_t dy01_dy20 = dy01 + dy20;

    // Vertex colors expanded to one float per 8-bit channel.
    const __m128i xmm_zero = _mm_setzero_si128();
    const __m128 xmm_c0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color0), xmm_zero), xmm_zero));
    const __m128 xmm_c1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color1), xmm_zero), xmm_zero));
    const __m128 xmm_c2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_loadu_si32(&color2), xmm_zero), xmm_zero));
    const __m128 xmm_inv_area = _mm_set1_ps(1.0f / (float)iarea);

    // Edge function values at the top-left pixel of the bounding box.
    // rowEdge0 is the (unnormalized) weight of vertex 2, rowEdge1 the weight
    // of vertex 1 and rowEdge2 the weight of vertex 0; they always sum to iarea.
    const int32_t dx0min = x0 - bboxMinX;
    const int32_t dy0min = y0 - bboxMinY;
    int32_t rowEdge0 = dx0min * dy01 - dy0min * dx01;
    int32_t rowEdge1 = dx0min * dy20 - dy0min * dx20;
    int32_t rowEdge2 = iarea - rowEdge0 - rowEdge1;

    // Change of each edge function when moving one row down...
    const int32_t rowStep0 = dx01;
    const int32_t rowStep1 = dx20;
    const int32_t rowStep2 = -(dx01 + dx20);
    // ...and when moving one pixel to the right (lanes: edge0, edge1, edge2, unused).
    const __m128i imm_diu = _mm_set_epi32(0, dy01_dy20, -dy20, -dy01);

    uint32_t* framebufferRow = &ctx->m_FrameBuffer[bboxMinX + bboxMinY * ctx->m_Width];
    for (int32_t iy = 0; iy <= bboxHeight; ++iy) {
        // Find the span of this row covered by the triangle by intersecting
        // the three half-planes. Exact integer division is used so boundary
        // pixels are neither dropped nor added by rounding, and horizontal
        // edges (zero x step) need no special casing by the caller.
        int32_t ixmin = 0;
        int32_t ixmax = bboxWidth;
        swr_clipSpanToEdge(rowEdge0, dy01, &ixmin, &ixmax);
        swr_clipSpanToEdge(rowEdge1, dy20, &ixmin, &ixmax);
        swr_clipSpanToEdge(rowEdge2, -dy01_dy20, &ixmin, &ixmax);

        // Edge function values at the first pixel of the span.
        // TODO: This could be stepped entirely in SIMD if the row values were
        // kept in an __m128i (needs a 32-bit integer multiply for the ixmin offset).
        __m128i imm_iu = _mm_set_epi32(0, rowEdge2 + ixmin * dy01_dy20, rowEdge1 - ixmin * dy20, rowEdge0 - ixmin * dy01);
        for (int32_t ix = ixmin; ix <= ixmax; ++ix) {
            // Normalized barycentric weights in lanes 0..2.
            const __m128 xmm_bc = _mm_mul_ps(_mm_cvtepi32_ps(imm_iu), xmm_inv_area);
            const __m128 xmm_w2 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(0, 0, 0, 0));
            const __m128 xmm_w1 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(1, 1, 1, 1));
            const __m128 xmm_w0 = _mm_shuffle_ps(xmm_bc, xmm_bc, _MM_SHUFFLE(2, 2, 2, 2));

            // color = c0 * w0 + c1 * w1 + c2 * w2
            const __m128 xmm_c = _mm_add_ps(
                _mm_add_ps(_mm_mul_ps(xmm_c0, xmm_w0), _mm_mul_ps(xmm_c1, xmm_w1)),
                _mm_mul_ps(xmm_c2, xmm_w2));

            // float -> int32 -> int16 -> uint8 (saturating), then store one pixel.
            const __m128i imm_c = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(xmm_c), xmm_zero), xmm_zero);
            _mm_storeu_si32(&framebufferRow[ix], imm_c);

            imm_iu = _mm_add_epi32(imm_iu, imm_diu);
        }

        rowEdge0 += rowStep0;
        rowEdge1 += rowStep1;
        rowEdge2 += rowStep2;
        framebufferRow += ctx->m_Width;
    }
}