<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<html>

<head>
<!-- DOC-FACTORY Generated Tags BEGIN -->
<TITLE>fdctintrinsic </TITLE>
<meta name = "creation_date" content = "18-May-98">
<meta name = "stop_date" content = "18-May-99">
<meta name = "next_check_date" content = "18-Nov-98">
<meta name = "last_check_date" content = "18-May-98">
<meta name = "web_author_id" content = "blconley">
<meta name = "language" content = "English">
<meta name = "country" content = "USA">

<meta http-equiv="Content-Type"
content="text/html; charset=iso-8859-1">
<meta name="GENERATOR" content="Microsoft FrontPage 2.0">
<!-- /DOC-FACTORY Generated Tags END -->
</head>

<body bgcolor="#FFFFFF">

<pre>/*
 * Perform the forward DCT on one block of samples.
 */

GLOBAL (void)
jpeg_fdct_ifast_intrinsic (DCTELEM * data)
{
  DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  DCTELEM tmp10, tmp11, tmp12, tmp13;
  DCTELEM z1, z2, z3, z4, z5, z11, z13;
  DCTELEM *dataptr;
  int ctr;
  SHIFT_TEMPS

	/* Variables for intrinsic code */
	__m64 m0=0, m1=0, m2=0, m3=0, m4=0, m5=0, m6=0, m7=0, m8=0, m9=0;
	__m64	*dataptr2;

	/* This union allows us to represent memory in different sizes */
	union q64 {
		DCTELEM	*w;
		int32	*d;
		__m64	*q;
	} dataptr1;

	dataptr = data;			// pointer to data buffer
	dataptr1.w = dataptr;		// set our pointer to data buffer; same pointer types
	dataptr2 = dataptr1.q;		// quad word pointer for intrinsic operations


/*  Start Transpose to do calculations on rows */

	m7 = m5 = *(dataptr2 + 9);		 // m03:m02|m01:m00 - first line (line 4)and copy into m5
	m7 = _m_punpcklwd(m7, *(dataptr2 + 11)); // m11:m01|m10:m00 - interleave first and second lines
 	m6 = m2 = *(dataptr2 + 13);		 // m23:m22|m21:m20 - third line (line 6)and copy into m2
	m6 = _m_punpcklwd(m6, *(dataptr2 + 15));  // m31:m21|m30:m20 - interleave third and fourth lines

	m1 = _m_punpckhdq(m7, m6);	// m31:m21|m11:m01 - interleave to produce result 2
	m7 = _m_punpckldq(m7, m6);	// m30:m20|m10:m00 - interleave to produce result 1

	m3 = *(dataptr2 + 11);		// m13:m12|m11:m10 - second line
	m0 = *(dataptr2 + 15);		// m33:m32|m31:m30 - fourth line

	m5 = _m_punpckhwd(m5, m3);	// m13:m03|m12:m02 - interleave first and second lines
	m2 = _m_punpckhwd(m2, m0);	// m33:m23|m32:m22 - interleave third and fourth lines
	*(dataptr2 + 9) = m7;		// write result 1
	*(dataptr2 + 11) = m1;		// write result 2

	m1 = _m_punpckhdq(m5, m2);	// m33:m23|m13:m03 - interleave to produce result 4
	m5 = _m_punpckldq(m5, m2);	// m32:m22|m12:m02 - interleave to produce result 3

	*(dataptr2 + 13) = m5;		// write result 3
	*(dataptr2 + 15) = m1;		// write result 4, last 4x4

	m2 = m7 = *(dataptr2 + 5);	// m23:m22|m21:m20 - third line
	m0 = m6 = *(dataptr2 + 1);	// m03:m02|m01:m00 - first line, 4x4

	m0 = _m_punpcklwd(m0, *(dataptr2 + 3));  // m11:m01|m10:m00 - interleave first and second lines
	m2 = _m_punpcklwd(m2, *(dataptr2 + 7));  // m31:m21|m30:m20 - interleave third and fourth lines

	m1 = *(dataptr2 + 8);		// n03:n02|n01:n00 - first line 
	m3 = *(dataptr2 + 12);		// n23:n22|n21:n20 - third line

	m4 = _m_punpckhdq(m0, m2);	// m31:m21|m11:m01 - interleave to produce second result
	m0 = _m_punpckldq(m0, m2);	// m30:m20|m10:m00 - interleave to produce first result

	m6 = _m_punpckhwd(m6, *(dataptr2 + 3));  // m13:m03|m12:m02 - interleave first and second lines
	m7 = _m_punpckhwd(m7, *(dataptr2 + 7));  // m33:m23|m32:m22 - interleave third and fourth lines

	m2 = m1;			// copy first line
	m5 = m6;			// copy first intermediate result

	*(dataptr2 + 8) = m0;		// write result 1
	m0 = m3;			// copy third line

	m5 = _m_punpckhdq(m6, m7);	// m33:m23|m13:m03 - produce third result
	m6 = _m_punpckldq(m6, m7);	// m32:m22|m12:m02 - produce fourth result

	m1 = _m_punpcklwd(m1, *(dataptr2 + 10));  // n11:n01|n10:n00 - interleave first and second lines
	m2 = _m_punpckhwd(m2, *(dataptr2 + 10));  // n13:n03|n12:n02 - interleave first and second lines

	m0 = _m_punpckhwd(m0, *(dataptr2 + 14));  // n33:n23|n32:n22 - interleave third and fourth lines
	m3 = _m_punpcklwd(m3, *(dataptr2 + 14));  // n31:n21|n30:n20 - interleave third and fourth lines

	*(dataptr2 + 10) = m4;		// write result 2 out
	*(dataptr2 + 12) = m6;		// write result 3 out
	*(dataptr2 + 14) = m5;		// write result 4 out

	m4 = _m_punpckhdq(m1, m3);	// n31:n21|n11:n01- produce second result
	m1 = _m_punpckldq(m1, m3);	// n30:n20|n10:n00 - produce first result

	m6 = _m_punpckhdq(m2, m0);	// n33:n23|n13:n03 - produce fourth result
	m2 = _m_punpckldq(m2, m0);	// n32:n22|n12:n02- produce third result

	*(dataptr2 + 1) = m1;		// write result 5 out - (first result for other 4 x 4 block)
	*(dataptr2 + 3) = m4;		// write result 6 out
	*(dataptr2 + 5) = m2;		// write result 7 out
	*(dataptr2 + 7) = m6;		// write result 8 out

// Do first 4x4 quadrant, which is used in the beginning of the DCT:

	m0 = m2 = *(dataptr2);		// m03:m02|m01:m00 - first line, first 4x4
	m7 = m4 = *(dataptr2 + 4);	// m23:m22|m21:m20 - third line

	m0 = _m_punpcklwd(m0, *(dataptr2 + 2));  // m11:m01|m10:m00 - interleave first and second lines
	m7 = _m_punpcklwd(m7, *(dataptr2 + 6));  // m31:m21|m30:m20 - interleave third and fourth lines

	m6 = *(dataptr2 + 2);		// m13:m12|m11:m10 - second line
	m5 = *(dataptr2 + 6);		// m33:m32|m31:m30 - fourth line

	m1 = _m_punpckhdq(m0, m7);	// m31:m21|m11:m01 - interleave to produce result 2
	m0 = _m_punpckldq(m0, m7);	// m30:m20|m10:m00 - interleave to produce result 1

	*(dataptr2 + 0) = m0;		// write result 1
	*(dataptr2 + 2) = m1;		// write result 2

	m7 = m0;			// save result1
	m2 = m3 = _m_punpckhwd(m2, m6);		// m13:m03|m12:m02 - interleave first and second lines

	m7 = _m_psubw(m7, *(dataptr2 + 14));	// tmp07=x0-x7	/* Stage 1 */
	m6 = m1;				// write result 2
	m0 = _m_paddw(m0, *(dataptr2 + 14));	// tmp00=x0+x7	/* Stage 1 */
	m4 = _m_punpckhwd(m4, m5);   		// m33:m23|m32:m22 - interleave third and fourth lines

	m1 = _m_paddw(m1, *(dataptr2 + 12));	// tmp01=x1+x6	/* Stage 1 */
	m3 = m2;				// copy first intermediate result
	m6 = _m_psubw(m6, *(dataptr2 + 12));	// tmp06=x1-x6	/* Stage 1 */
	m2 = _m_punpckldq(m2, m4);		// m32:m22|m12:m02 - interleave to produce result 3


	*(dataptr2 + 4) = m2;			// write result 3
	m3 = _m_punpckhdq(m3, m4);		// m33:m23|m13:m03 - interleave to produce result 4
	*(dataptr2 + 6) = m3;			// write result 4


/************************************************************************************************
					End of Transpose

************************************************************************************************/

	// Even
	m0 = _m_paddw(*dataptr2, *(dataptr2 + 14));			// column 1 + column 8 (tmp0)
	m1 = _m_psubw(*dataptr2, *(dataptr2 + 14));			// column 1 - column 8 (tmp7)

	m2 = _m_paddw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 + column 7 (tmp1)
	m3 = _m_psubw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 - column 7 (tmp6)

	m4 = _m_paddw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 + column 6 (tmp2)
	m5 = _m_psubw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 - column 6 (tmp5)

	m6 = _m_paddw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 + column 5 (tmp3)
	m7 = _m_psubw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 - column 5 (tmp4)
     
	m8 = _m_paddw(m0, m6);  		// tmp10
	m9 = _m_paddw(m2, m4);  		// tmp11

	m0 = _m_psubw(m0, m6);  		// tmp13
	m2 = _m_psubw(m2, m4);  		// tmp12

	*(dataptr2 + 0) = _m_paddw(m8, m9);	// write out results
	*(dataptr2 + 8) = _m_psubw(m8, m9);	// write out results

	m8 = _m_paddw(m0, m2);			// tmp12 + tmp13
	m8 = _m_psllw(m8, 3);			// m8 * 2^3
	m8 = _m_pmulhw(m8, FIX_0_7);		// z1
	*(dataptr2 + 4) = _m_paddw(m0, m8);	// write out results
	*(dataptr2 + 12) = _m_psubw(m0, m8);	// write out results
   
    /* Odd part */

	m8 = _m_paddw(m7, m5);			// tmp10
	m9 = _m_paddw(m5, m3);			// tmp11
	m2 = _m_paddw(m3, m1);			// tmp12

	m0 = _m_psubw(m8, m2);			// tmp10 - tmp12
	m0 = _m_psllw(m0, 3);			// m8 * 2^3
	m0 = _m_pmulhw(m0, FIX_0_3);		// z5

	m5 = _m_psllw(m8, 3);
	m5 = _m_pmulhw(m5, FIX_0_5);
	m5 = _m_paddw(m5, m0);			// z2

	m3 = _m_psllw(m2, 3);
	m3 = _m_pmulhw(m3, FIX_1_3);
	m3 = _m_paddw(m3, m0);			// z4

	m4 = _m_psllw(m9, 3);
	m4 = _m_pmulhw(m4, FIX_0_7);		// z3

	m6 = _m_paddw(m1, m4);			// z11
	m7 = _m_psubw(m1, m4);			// z13

	*(dataptr2 + 10) = _m_paddw(m7, m5);	// Write out results
	*(dataptr2 + 6)  = _m_psubw(m7, m5);	// Write out results
	*(dataptr2 + 2)  = _m_paddw(m6, m3);	// Write out results
	*(dataptr2 + 14) = _m_psubw(m6, m3);	// Write out results

	dataptr2++;

	// Odd

	m0 = _m_paddw(*dataptr2, *(dataptr2 + 14));			// column 1 + column 8 (tmp0)
	m1 = _m_psubw(*dataptr2, *(dataptr2 + 14));			// column 1 - column 8 (tmp7)

	m2 = _m_paddw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 + column 7 (tmp1)
	m3 = _m_psubw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 - column 7 (tmp6)

	m4 = _m_paddw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 + column 6 (tmp2)
	m5 = _m_psubw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 - column 6 (tmp5)

	m6 = _m_paddw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 + column 5 (tmp3)
	m7 = _m_psubw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 - column 5 (tmp4)
     
	m8 = _m_paddw(m0, m6);  // tmp10
	m9 = _m_paddw(m2, m4);  // tmp11

	m0 = _m_psubw(m0, m6);  // tmp13
	m2 = _m_psubw(m2, m4);  // tmp12

	*(dataptr2 + 0) = _m_paddw(m8, m9);	// add and write out results
	*(dataptr2 + 8) = _m_psubw(m8, m9);	// subtract and write out results

	m8 = _m_paddw(m0, m2);			// tmp12 + tmp13
	m8 = _m_psllw(m8, 3);			// m8 * 2^3
	m8 = _m_pmulhw(m8, FIX_0_7);		// z1
	*(dataptr2 + 4) = _m_paddw(m0, m8);	// Write out results
	*(dataptr2 + 12) = _m_psubw(m0, m8);	// Write out results
   
	m8 = _m_paddw(m7, m5);	// tmp10
	m9 = _m_paddw(m5, m3);	// tmp11
	m2 = _m_paddw(m3, m1);	// tmp12

	m0 = _m_psubw(m8, m2);			// tmp10 - tmp12
	m0 = _m_psllw(m0, 3);			// m8 * 2^3
	m0 = _m_pmulhw(m0, FIX_0_3);		// z5

	m5 = _m_psllw(m8, 3);			// prepare for multiply 
	m5 = _m_pmulhw(m5, FIX_0_5);		// multiply by converted real
	m5 = _m_paddw(m5, m0);			// z2

	m3 = _m_psllw(m2, 3);			// prepare for multiply of converted real
	m3 = _m_pmulhw(m3, FIX_1_3);		// multiply
	m3 = _m_paddw(m3, m0);			// z4

	m4 = _m_psllw(m9, 3);			// prepare for multiply of converted real
	m4 = _m_pmulhw(m4, FIX_0_7);		// z3

	m6 = _m_paddw(m1, m4);			// z11
	m7 = _m_psubw(m1, m4);			// z13

	*(dataptr2 + 10) = _m_paddw(m7, m5);	// add and write out results
	*(dataptr2 + 6)  = _m_psubw(m7, m5);	// subtract and  write out results
	*(dataptr2 + 2)  = _m_paddw(m6, m3);	// add and write out results
	*(dataptr2 + 14) = _m_psubw(m6, m3);	// subtract and  write out results


/*  Transpose back to original column order */

	dataptr2 = dataptr1.q;				// point to start of buffer

	m7 = m5 = *(dataptr2 + 9);		 // m03:m02|m01:m00 - first line (line 4)and copy into m5
	m7 = _m_punpcklwd(m7, *(dataptr2 + 11)); // m11:m01|m10:m00 - interleave first and second lines
 	m6 = m2 = *(dataptr2 + 13);		 // m23:m22|m21:m20 - third line (line 6)and copy into m2
	m6 = _m_punpcklwd(m6, *(dataptr2 + 15));  // m31:m21|m30:m20 - interleave third and fourth lines

	m1 = _m_punpckhdq(m7, m6);	// m31:m21|m11:m01 - interleave to produce result 2
	m7 = _m_punpckldq(m7, m6);	// m30:m20|m10:m00 - interleave to produce result 1

	m3 = *(dataptr2 + 11);		// m13:m12|m11:m10 - second line
	m0 = *(dataptr2 + 15);		// m33:m32|m31:m30 - fourth line

	m5 = _m_punpckhwd(m5, m3);	// m13:m03|m12:m02 - interleave first and second lines
	m2 = _m_punpckhwd(m2, m0);	// m33:m23|m32:m22 - interleave third and fourth lines
	*(dataptr2 + 9) = m7;		// write result 1
	*(dataptr2 + 11) = m1;		// write result 2

	m1 = _m_punpckhdq(m5, m2);	// m33:m23|m13:m03 - interleave to produce result 4
	m5 = _m_punpckldq(m5, m2);	// m32:m22|m12:m02 - interleave to produce result 3

	*(dataptr2 + 13) = m5;		// write result 3
	*(dataptr2 + 15) = m1;		// write result 4, last 4x4

	m2 = m7 = *(dataptr2 + 5);	// m23:m22|m21:m20 - third line
	m0 = m6 = *(dataptr2 + 1);	// m03:m02|m01:m00 - first line, 4x4

	m0 = _m_punpcklwd(m0, *(dataptr2 + 3));  // m11:m01|m10:m00 - interleave first and second lines
	m2 = _m_punpcklwd(m2, *(dataptr2 + 7));  // m31:m21|m30:m20 - interleave third and fourth lines

	m1 = *(dataptr2 + 8);		// n03:n02|n01:n00 - first line 
	m3 = *(dataptr2 + 12);		// n23:n22|n21:n20 - third line

	m4 = _m_punpckhdq(m0, m2);	// m31:m21|m11:m01 - interleave to produce second result
	m0 = _m_punpckldq(m0, m2);	// m30:m20|m10:m00 - interleave to produce first result

	m6 = _m_punpckhwd(m6, *(dataptr2 + 3));  // m13:m03|m12:m02 - interleave first and second lines
	m7 = _m_punpckhwd(m7, *(dataptr2 + 7));  // m33:m23|m32:m22 - interleave third and fourth lines

	m2 = m1;			// copy first line
	m5 = m6;			// copy first intermediate result

	*(dataptr2 + 8) = m0;		// write result 1
	m0 = m3;			// copy third line

	m5 = _m_punpckhdq(m6, m7);	// m33:m23|m13:m03 - produce third result
	m6 = _m_punpckldq(m6, m7);	// m32:m22|m12:m02 - produce fourth result

	m1 = _m_punpcklwd(m1, *(dataptr2 + 10));  // n11:n01|n10:n00 - interleave first and second lines
	m2 = _m_punpckhwd(m2, *(dataptr2 + 10));  // n13:n03|n12:n02 - interleave first and second lines

	m0 = _m_punpckhwd(m0, *(dataptr2 + 14));  // n33:n23|n32:n22 - interleave third and fourth lines
	m3 = _m_punpcklwd(m3, *(dataptr2 + 14));  // n31:n21|n30:n20 - interleave third and fourth lines

	*(dataptr2 + 10) = m4;		// write result 2 out
	*(dataptr2 + 12) = m6;		// write result 3 out
	*(dataptr2 + 14) = m5;		// write result 4 out

	m4 = _m_punpckhdq(m1, m3);	// n31:n21|n11:n01- produce second result
	m1 = _m_punpckldq(m1, m3);	// n30:n20|n10:n00 - produce first result

	m6 = _m_punpckhdq(m2, m0);	// n33:n23|n13:n03 - produce fourth result
	m2 = _m_punpckldq(m2, m0);	// n32:n22|n12:n02- produce third result

	*(dataptr2 + 1) = m1;		// write result 5 out - (first result for other 4 x 4 block)
	*(dataptr2 + 3) = m4;		// write result 6 out
	*(dataptr2 + 5) = m2;		// write result 7 out
	*(dataptr2 + 7) = m6;		// write result 8 out

// Do first 4x4 quadrant, which is used in the beginning of the DCT:

	m0 = m2 = *(dataptr2);		// m03:m02|m01:m00 - first line, first 4x4
	m7 = m4 = *(dataptr2 + 4);	// m23:m22|m21:m20 - third line

	m0 = _m_punpcklwd(m0, *(dataptr2 + 2));  // m11:m01|m10:m00 - interleave first and second lines
	m7 = _m_punpcklwd(m7, *(dataptr2 + 6));  // m31:m21|m30:m20 - interleave third and fourth lines

	m6 = *(dataptr2 + 2);		// m13:m12|m11:m10 - second line
	m5 = *(dataptr2 + 6);		// m33:m32|m31:m30 - fourth line

	m1 = _m_punpckhdq(m0, m7);	// m31:m21|m11:m01 - interleave to produce result 2
	m0 = _m_punpckldq(m0, m7);	// m30:m20|m10:m00 - interleave to produce result 1

	*(dataptr2 + 0) = m0;		// write result 1
	*(dataptr2 + 2) = m1;		// write result 2

	m7 = m0;			// save result1
	m2 = m3 = _m_punpckhwd(m2, m6);		// m13:m03|m12:m02 - interleave first and second lines

	m7 = _m_psubw(m7, *(dataptr2 + 14));	// tmp07=x0-x7	/* Stage 1 */
	m6 = m1;				// write result 2
	m0 = _m_paddw(m0, *(dataptr2 + 14));	// tmp00=x0+x7	/* Stage 1 */
	m4 = _m_punpckhwd(m4, m5);   		// m33:m23|m32:m22 - interleave third and fourth lines

	m1 = _m_paddw(m1, *(dataptr2 + 12));	// tmp01=x1+x6	/* Stage 1 */
	m3 = m2;				// copy first intermediate result
	m6 = _m_psubw(m6, *(dataptr2 + 12));	// tmp06=x1-x6	/* Stage 1 */
	m2 = _m_punpckldq(m2, m4);		// m32:m22|m12:m02 - interleave to produce result 3


	*(dataptr2 + 4) = m2;			// write result 3
	m3 = _m_punpckhdq(m3, m4);		// m33:m23|m13:m03 - interleave to produce result 4
	*(dataptr2 + 6) = m3;			// write result 4


/************************************************************************************************
					End of Transpose 2

************************************************************************************************/


  /* Pass 2: process columns. */

  dataptr = data;
  dataptr1.w = dataptr;
  dataptr2 = dataptr1.q;


	// Even
	m0 = _m_paddw(*dataptr2, *(dataptr2 + 14));			// column 1 + column 8 (tmp0)
	m1 = _m_psubw(*dataptr2, *(dataptr2 + 14));			// column 1 - column 8 (tmp7)

	m2 = _m_paddw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 + column 7 (tmp1)
	m3 = _m_psubw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 - column 7 (tmp6)

	m4 = _m_paddw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 + column 6 (tmp2)
	m5 = _m_psubw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 - column 6 (tmp5)

	m6 = _m_paddw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 + column 5 (tmp3)
	m7 = _m_psubw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 - column 5 (tmp4)
     
	m8 = _m_paddw(m0, m6);  		// tmp10
	m9 = _m_paddw(m2, m4);  		// tmp11

	m0 = _m_psubw(m0, m6);  		// tmp13
	m2 = _m_psubw(m2, m4);  		// tmp12

	*(dataptr2 + 0) = _m_paddw(m8, m9);	// tmp10 + tmp11 &amp; write result
	*(dataptr2 + 8) = _m_psubw(m8, m9);	// tmp10 - tmp11 &amp; write result

	m8 = _m_paddw(m0, m2);			// tmp12 + tmp13
	m8 = _m_psllw(m8, 3);			// m8 * 2^3
	m8 = _m_pmulhw(m8, FIX_0_7);		// z1
	*(dataptr2 + 4) = _m_paddw(m0, m8);	// tmp13 + tmp10 &amp; write result
	*(dataptr2 + 12) = _m_psubw(m0, m8);	// tmp13 - tmp10 &amp; write result
   
    /* Odd part */

	m8 = _m_paddw(m7, m5);			// tmp10
	m9 = _m_paddw(m5, m3);			// tmp11
	m2 = _m_paddw(m3, m1);			// tmp12

	m0 = _m_psubw(m8, m2);			// tmp10 - tmp12
	m0 = _m_psllw(m0, 3);			// m8 * 2^3
	m0 = _m_pmulhw(m0, FIX_0_3);		// z5

	m5 = _m_psllw(m8, 3);			// shift to prepare for multiply
	m5 = _m_pmulhw(m5, FIX_0_5);		// multiply by constant (real * 2^13)
	m5 = _m_paddw(m5, m0);			// z2

	m3 = _m_psllw(m2, 3);			// shift to prepare for multiply
	m3 = _m_pmulhw(m3, FIX_1_3);		// multiply by constant (real * 2^13)
	m3 = _m_paddw(m3, m0);			// z4

	m4 = _m_psllw(m9, 3);			// shift to prepare for multiply
	m4 = _m_pmulhw(m4, FIX_0_7);		// z3

	m6 = _m_paddw(m1, m4);			// z11
	m7 = _m_psubw(m1, m4);			// z13

	*(dataptr2 + 10) = _m_paddw(m7, m5);	// z13 + z2 &amp; write out
	*(dataptr2 + 6)  = _m_psubw(m7, m5);	// z13 - z2 &amp; write out
	*(dataptr2 + 2)  = _m_paddw(m6, m3);	// z11 + z4 &amp; write out
	*(dataptr2 + 14) = _m_psubw(m6, m3);	// z11 - z4 &amp; write out

	dataptr2++;				// increment buffer pointer

	// Odd

	m0 = _m_paddw(*dataptr2, *(dataptr2 + 14));			// column 1 + column 8 (tmp0)
	m1 = _m_psubw(*dataptr2, *(dataptr2 + 14));			// column 1 - column 8 (tmp7)

	m2 = _m_paddw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 + column 7 (tmp1)
	m3 = _m_psubw(*(dataptr2 + 2), *(dataptr2 + 12));		// column 2 - column 7 (tmp6)

	m4 = _m_paddw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 + column 6 (tmp2)
	m5 = _m_psubw(*(dataptr2 + 4), *(dataptr2 + 10));		// column 3 - column 6 (tmp5)

	m6 = _m_paddw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 + column 5 (tmp3)
	m7 = _m_psubw(*(dataptr2 + 6), *(dataptr2 + 8));		// column 4 - column 5 (tmp4)
     
	m8 = _m_paddw(m0, m6);  		// tmp10
	m9 = _m_paddw(m2, m4);  		// tmp11

	m0 = _m_psubw(m0, m6);  		// tmp13
	m2 = _m_psubw(m2, m4);  		// tmp12

	*(dataptr2 + 0) = _m_paddw(m8, m9);	// tmp10 + tmp11 &amp; write out
	*(dataptr2 + 8) = _m_psubw(m8, m9);	// tmp10 - tmp11 &amp; write out

	m8 = _m_paddw(m0, m2);			// tmp12 + tmp13
	m8 = _m_psllw(m8, 3);			// m8 * 2^3
	m8 = _m_pmulhw(m8, FIX_0_7);		// z1
	*(dataptr2 + 4) = _m_paddw(m0, m8);	// tmp13 + tmp10 &amp; write out
	*(dataptr2 + 12) = _m_psubw(m0, m8);	// tmp13 - tmp10 &amp; write out
   
	m8 = _m_paddw(m7, m5);			// tmp10
	m9 = _m_paddw(m5, m3);			// tmp11
	m2 = _m_paddw(m3, m1);			// tmp12

	m0 = _m_psubw(m8, m2);			// tmp10 - tmp12
	m0 = _m_psllw(m0, 3);			// m8 * 2^3
	m0 = _m_pmulhw(m0, FIX_0_3);		// z5

	m5 = _m_psllw(m8, 3);			// shift left for multiply (* 2^3)
	m5 = _m_pmulhw(m5, FIX_0_5);		// multiply by fixed point constant
	m5 = _m_paddw(m5, m0);			// z2

	m3 = _m_psllw(m2, 3);			// shift left for multiply (* 2^3)
	m3 = _m_pmulhw(m3, FIX_1_3);		// multiply by fixed point constant
	m3 = _m_paddw(m3, m0);			// z4

	m4 = _m_psllw(m9, 3);			// shift left for multiply (* 2^3)
	m4 = _m_pmulhw(m4, FIX_0_7);		// z3

	m6 = _m_paddw(m1, m4);			// z11
	m7 = _m_psubw(m1, m4);			// z13

	*(dataptr2 + 10) = _m_paddw(m7, m5);	// z13 + z2 &amp; write out
	*(dataptr2 + 6)  = _m_psubw(m7, m5);	// z13 - z2 &amp; write out
	*(dataptr2 + 2)  = _m_paddw(m6, m3);	// z11 + z4 &amp; write out
	*(dataptr2 + 14) = _m_psubw(m6, m3);	// z11 - z4 &amp; write out
	_m_empty();				// EMMS

}</pre>

<h4 align="center"><font size="4"><sup>Code Listing 5: MMX&trade;
Technology Intrinsic Implementation of Fast DCT</sup></font></h4>
</body>
</html>
