/*
 * Copyright (C) 2007-2008 Andre Beckedorf <andre@graphics32.org>
 *
 * Includes ported and optimized sourcecode of the Graphics32 project.
 * The code has been ported from Delphi to C++/Qt and was heavily optimized
 * for use on mobile devices. The most critical parts use integer-only
 * calculation for optimal performance on CPUs that don't feature an FPU
 * (most ARM-based CPUs for example).
 *
 * More information is available here:
 * http://www.graphics32.org
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "imagefx.h"
#include <qrect.h>
#include <qpoint.h>
#include <qpainter.h>
#include <string.h>
#include <math.h>
#include <limits.h>
#include <debug.h>

// Qtopia builds enable USE_LUTLERP: the perspective divide then uses a
// linearly interpolated reciprocal lookup table (see lutLerp32) instead of
// computing a reciprocal by division.
#ifdef QTOPIA
#define USE_LUTLERP
#endif

// Returns rgb with its top byte (bits 24..31, the alpha channel) replaced by
// the low 8 bits of a; the lower 24 colour bits are passed through untouched.
inline QRgb setAlpha(QRgb rgb, unsigned int a)
{
    const unsigned int colorBits = rgb & 0x00FFFFFF;
    const unsigned int alphaBits = (a & 0xFF) << 24;
    return alphaBits | colorBits;
}

// Scales srcSize to fit inside maxDstSize while keeping the aspect ratio.
// The more restrictive axis is set to the maximum extent, the other axis is
// scaled by the same factor (fractions are dropped on conversion to int).
// Returns QSize(0, 0) when either source dimension is zero.
QSize ImageFX::optimalFitSize(const QSize &srcSize, const QSize &maxDstSize)
{
	float fitWidth = srcSize.width();
	float fitHeight = srcSize.height();

	if (fitWidth == 0 || fitHeight == 0)
		return QSize(0, 0);

	const float scaleX = maxDstSize.width() / fitWidth;
	const float scaleY = maxDstSize.height() / fitHeight;

	if (scaleX >= scaleY)
	{
		// height is the limiting dimension
		fitWidth = fitWidth * scaleY;
		fitHeight = maxDstSize.height();
	}
	else
	{
		// width is the limiting dimension
		fitHeight = fitHeight * scaleX;
		fitWidth = maxDstSize.width();
	}

	return QSize(fitWidth, fitHeight);
}

// Returns a 32-bit copy of srcImage with a vertically mirrored "reflection"
// appended below it.
//
//   reflectionHeight  number of mirrored rows to append (clamped to the
//                     height of srcImage)
//   startAlpha        alpha at the top of the reflection
//   stopAlpha         alpha reached on the last reflection row
//
// The alpha values are shifted into the top byte of a 32-bit accumulator, so
// they are expected to be in 0..255. The ramp is linear and may either fall
// (startAlpha >= stopAlpha) or rise (startAlpha < stopAlpha).
// If there is nothing to mirror (reflectionHeight or the image width is 0)
// the plain copy is returned.
QImage ImageFX::createReflection(const QImage &srcImage, uint reflectionHeight, uint startAlpha, uint stopAlpha)
{
	uint width = srcImage.width();
	uint height = srcImage.height();

	if (reflectionHeight > height)
		reflectionHeight = height;

#ifdef QT4
	QImage newImage(width, height + reflectionHeight, QImage::Format_ARGB32);
	QPainter painter(&newImage);
	painter.drawImage(0, 0, srcImage);
#else
	QImage newImage(width, height + reflectionHeight, 32);
	bitBlt(&newImage, 0, 0, &srcImage);
	newImage.setAlphaBuffer(true);
#endif

	// Without reflection rows the division below would divide by zero and
	// scanLine(height) would address a row past the end of the image.
	if (reflectionHeight == 0 || width == 0)
		return newImage;

	QRgb *dst = (QRgb *)newImage.scanLine(height);
	uchar *srcBits = newImage.bits();

	// poor man's fixed point div: alpha is kept as 8.24 fixed point.
	// The step is computed from the absolute difference so that a rising
	// ramp does not wrap around in unsigned arithmetic.
	const bool fadingOut = startAlpha >= stopAlpha;
	const uint alphaSpan = fadingOut ? startAlpha - stopAlpha : stopAlpha - startAlpha;
	const uint alphaDelta = (alphaSpan << 24) / reflectionHeight;
	uint alpha = startAlpha << 24;

	for (uint y = 1; y <= reflectionHeight; ++y)
	{
		// walk the source rows bottom-up (rows are assumed to be width * 4 bytes)
		const QRgb *src = (const QRgb *)(srcBits + ((height - y) * width << 2));

		alpha = fadingOut ? alpha - alphaDelta : alpha + alphaDelta;
		const uint innerAlpha = alpha & 0xFF000000; // throw away precision...

		for (uint x = 0; x < width; ++x)
			*dst++ = (*src++ & 0x00FFFFFF) | innerAlpha;
	}

	return newImage;
}


// 3x3 matrix types: FixedMatrix holds fixed point integers, FloatMatrix
// holds single precision floats.
typedef int FixedMatrix[3][3];
typedef float FloatMatrix[3][3];

// Multiplicative identity.
const FloatMatrix identityMatrix =
{
	{1.0f, 0.0f, 0.0f},
	{0.0f, 1.0f, 0.0f},
	{0.0f, 0.0f, 1.0f}
};

// All-zero matrix.
const FloatMatrix emptyMatrix =
{
	{0.0f, 0.0f, 0.0f},
	{0.0f, 0.0f, 0.0f},
	{0.0f, 0.0f, 0.0f}
};

// Fixed point math constants

// Number of fractional bits used by the fixed point matrices.
const int FixedPrec = 16;
// The value 1.0 in fixed point representation.
const int FixedOne = 1 << FixedPrec;
// Factor that converts a fixed point value back to float.
// Must be a floating point division: the integer expression 1 / FixedOne
// evaluates to 0.
const float FixedToFloat = 1.0f / FixedOne;

// Number of bits the reciprocal of z is scaled up by (and the source
// coordinates are scaled down by) during the perspective divide.
const int LutLerpPrec = 14;

// Matrix conversion routines

// Converts every element of srcFloatMatrix to fixed point (scaled by
// FixedOne, truncated to int) and stores it in dstFixedMatrix.
inline void floatToFixedMatrix(const FloatMatrix srcFloatMatrix, FixedMatrix dstFixedMatrix)
{
	for (int row = 0; row < 3; ++row)
	{
		for (int col = 0; col < 3; ++col)
			dstFixedMatrix[row][col] = (int)(srcFloatMatrix[row][col] * (float)FixedOne);
	}
}

// Converts every element of srcFixedMatrix to float by multiplying it with
// FixedToFloat and stores it in dstFloatMatrix.
inline void fixedToFloatMatrix(const FixedMatrix srcFixedMatrix, FloatMatrix dstFloatMatrix)
{
	for (int row = 0; row < 3; ++row)
	{
		for (int col = 0; col < 3; ++col)
			dstFloatMatrix[row][col] = (float)srcFixedMatrix[row][col] * FixedToFloat;
	}
}

// Determinant of the 2x2 matrix
//   | a1 a2 |
//   | b1 b2 |
inline float _DET(float a1, float a2, float b1, float b2)
{
	const float mainDiagonal = a1 * b2;
	const float antiDiagonal = a2 * b1;
	return mainDiagonal - antiDiagonal;
}

// Determinant of the 3x3 matrix
//   | a1 a2 a3 |
//   | b1 b2 b3 |
//   | c1 c2 c3 |
// expanded along the first column.
inline float _DET(float a1, float a2, float a3, float b1, float b2, float b3, float c1, float c2, float c3)
{
	const float minorA = b2 * c3 - b3 * c2;
	const float minorB = a2 * c3 - a3 * c2;
	const float minorC = a2 * b3 - a3 * b2;

	return a1 * minorA - b1 * minorB + c1 * minorC;
}

// Replaces M in place by its adjoint (the transposed cofactor matrix).
void adjointFloatMatrix(FloatMatrix M)
{
	// Snapshot the rows first: every output element depends on several
	// input elements, which are overwritten below.
	const float r0[3] = { M[0][0], M[0][1], M[0][2] };
	const float r1[3] = { M[1][0], M[1][1], M[1][2] };
	const float r2[3] = { M[2][0], M[2][1], M[2][2] };

	M[0][0] =  _DET(r1[1], r1[2], r2[1], r2[2]);
	M[0][1] = -_DET(r0[1], r0[2], r2[1], r2[2]);
	M[0][2] =  _DET(r0[1], r0[2], r1[1], r1[2]);

	M[1][0] = -_DET(r1[0], r1[2], r2[0], r2[2]);
	M[1][1] =  _DET(r0[0], r0[2], r2[0], r2[2]);
	M[1][2] = -_DET(r0[0], r0[2], r1[0], r1[2]);

	M[2][0] =  _DET(r1[0], r1[1], r2[0], r2[1]);
	M[2][1] = -_DET(r0[0], r0[1], r2[0], r2[1]);
	M[2][2] =  _DET(r0[0], r0[1], r1[0], r1[1]);
}

// Returns the determinant of M. The elements are handed to _DET column by
// column, i.e. the determinant of the transpose is evaluated.
inline float determinantFloatMatrix(const FloatMatrix M)
{
	const float *row0 = M[0];
	const float *row1 = M[1];
	const float *row2 = M[2];

	return _DET(
		row0[0], row1[0], row2[0],
		row0[1], row1[1], row2[1],
		row0[2], row1[2], row2[2]
	);
}

// Multiplies every element of M by factor, in place.
inline void scaleFloatMatrix(FloatMatrix M, float factor)
{
	for (int row = 0; row < 3; ++row)
	{
		float *elements = M[row];
		for (int col = 0; col < 3; ++col)
			elements[col] *= factor;
	}
}

// Copies all nine elements of src into dst.
inline void copyFloatMatrix(const FloatMatrix src, FloatMatrix dst)
{
	for (int row = 0; row < 3; ++row)
	{
		const float *from = src[row];
		float *to = dst[row];
		for (int col = 0; col < 3; ++col)
			to[col] = from[col];
	}
}

// Inverts M in place (adjoint divided by determinant).
// A matrix whose determinant has a magnitude below 1E-5 is treated as
// singular and replaced by the identity matrix instead.
void invertFloatMatrix(FloatMatrix M)
{
	const float det = determinantFloatMatrix(M);

	if (fabs(det) < 1E-5)
	{
		copyFloatMatrix(identityMatrix, M);
		return;
	}

	adjointFloatMatrix(M);
	scaleFloatMatrix(M, 1 / det);
}

void multiplyFloatMatrices(const FloatMatrix M1, const FloatMatrix M2, FloatMatrix out)
{
	FloatMatrix result;

	for (int i = 0; i < 3; ++i)
		for (int j = 0; j < 3; ++j)
			result[i][j] =
				M1[0][j] * M2[i][0] +
				M1[1][j] * M2[i][1] +
				M1[2][j] * M2[i][2];

	copyFloatMatrix(result, out);
}

// Returns the smaller of the two integers.
inline int min(int A, int B)
{
	if (A > B)
		return B;
	return A;
}

// Returns the smaller of the two floats (A when neither compares greater).
inline float min(float A, float B)
{
	if (A > B)
		return B;
	return A;
}

// Returns the larger of the two integers.
inline int max(int A, int B)
{
	if (B > A)
		return B;
	return A;
}

// Returns the larger of the two floats (A when neither compares greater).
inline float max(float A, float B)
{
	if (B > A)
		return B;
	return A;
}

// Prints the given name followed by the three rows of fm via DPRINTF.
void debugPrintFloatMatrix(const QString &name, const FloatMatrix fm)
{
	DPRINTF("FloatMatrix %s", (const char*)name.utf8());

	int row = 0;
	while (row < 3)
	{
		DPRINTF(" %f, %f, %f ", fm[row][0], fm[row][1], fm[row][2]);
		++row;
	}
}

// Prints the given name followed by the three rows of fm (as raw fixed point
// integers) via DPRINTF.
void debugPrintFixedMatrix(const QString &name, const FixedMatrix fm)
{
	DPRINTF("FixedMatrix %s", (const char*)name.utf8());

	int row = 0;
	while (row < 3)
	{
		DPRINTF(" %d, %d, %d ", fm[row][0], fm[row][1], fm[row][2]);
		++row;
	}
}

// Combines the ARGB channels of colors X and Y, with the weight of X given
// in weight (0..255):
//   Z = weight * X + (255 - weight) * Y, per channel, scaled back by 256
// All four channels are combined, including alpha. A weight of 0 returns Y
// and a weight of 255 returns X unchanged.
//
// Two channels are processed per multiplication: red/blue live in the even
// bytes of the word, alpha/green in the odd bytes (shifted down by 8 first
// to make room for the product).
uint interpolateARGB(uint X, uint Y, uint weight)
{
	if (weight == 0)
		return Y;
	if (weight == 255)
		return X;

	const uint rounding = 0x00800080;   // + 0.5 for both packed products
	const uint evenBytes = 0x00FF00FF;  // 00 rr 00 bb
	const uint oddBytes = 0xFF00FF00;   // aa 00 gg 00
	const uint inverseWeight = weight ^ 0x000000FF;

	// P = weight * X
	const uint xRedBlue = (((X & evenBytes) * weight + rounding) & oddBytes) >> 8;
	const uint xAlphaGreen = (((X & oddBytes) >> 8) * weight + rounding) & oddBytes;

	// Q = (255 - weight) * Y
	const uint yRedBlue = (((Y & evenBytes) * inverseWeight + rounding) & oddBytes) >> 8;
	const uint yAlphaGreen = (((Y & oddBytes) >> 8) * inverseWeight + rounding) & oddBytes;

	// Z = P + Q (assuming no overflow at each byte)
	return (xRedBlue | xAlphaGreen) + (yAlphaGreen | yRedBlue);
}

/*
inline uint blendARGB(uint ForegroundColor, uint BackgroundColor)
{
	register uint temp1, temp2;
	register uint weight = ForegroundColor >> 24; // weight (alpha 0-255)

	if (weight == 255)
		return ForegroundColor;
	else if (weight == 0)
		return BackgroundColor;
	else
	{
		temp1 = ((((ForegroundColor & 0xFF00FF00) >> 8) * weight) + 0x00800080) & 0xFF00FF00;
		ForegroundColor = (((ForegroundColor & 0x00FF00FF) * weight + 0x00800080) & 0xFF00FF00) >> 8;
		temp2 = ForegroundColor | temp1;

		weight = weight ^ 0x000000FF;

		temp1 = ((((BackgroundColor & 0xFF00FF00) >> 8) * weight) + 0x00800080) & 0xFF00FF00;
		BackgroundColor = (((BackgroundColor & 0x00FF00FF) * weight + 0x00800080) & 0xFF00FF00) >> 8;
		temp1 = temp1 | BackgroundColor;

		return temp2 + temp1;
	}
}
*/

// Blends a foreground color (F) onto a background color (B), using the
// alpha channel value of F as weight:
//   Z = Fa * F + (255 - Fa) * B, per channel, scaled back by 256
// All four channels (including alpha) go through the same weighting.
// A fully opaque foreground is returned as is, a fully transparent one
// yields the background unchanged.
uint blendARGB(uint ForegroundColor, uint BackgroundColor)
{
	const uint weight = ForegroundColor >> 24; // weight (alpha 0-255)

	if (weight == 255)
		return ForegroundColor;
	if (weight == 0)
		return BackgroundColor;

	const uint rounding = 0x00800080;   // + 0.5 for both packed products
	const uint evenBytes = 0x00FF00FF;  // 00 rr 00 bb
	const uint oddBytes = 0xFF00FF00;   // aa 00 gg 00
	const uint inverseWeight = weight ^ 0x000000FF;

	// P = W * F
	const uint fRedBlue = (((ForegroundColor & evenBytes) * weight + rounding) & oddBytes) >> 8;
	const uint fAlphaGreen = (((ForegroundColor & oddBytes) >> 8) * weight + rounding) & oddBytes;

	// Q = (255 - W) * B
	const uint bRedBlue = (((BackgroundColor & evenBytes) * inverseWeight + rounding) & oddBytes) >> 8;
	const uint bAlphaGreen = (((BackgroundColor & oddBytes) >> 8) * inverseWeight + rounding) & oddBytes;

	// Z = P + Q (assuming no overflow at each byte)
	return (fRedBlue | fAlphaGreen) + (bAlphaGreen | bRedBlue);
}

// Blends a foreground color (F) onto a background color (B), using the
// alpha channel value of F multiplied by a master alpha (M):
//   W = (Fa * (M + 1)) >> 8
//   Z = W * Frgb + (255 - W) * Brgb, per channel, scaled back by 256
// Only the red, green and blue channels are blended; the alpha byte of the
// blended result is 0. If the foreground is fully transparent, or the
// combined weight W is 0, BackgroundColor is returned unchanged (including
// its alpha byte).
// There is no shortcut for W = 255, so a fully opaque foreground is still
// passed through the multiplication.
uint blendARGB(uint ForegroundColor, uint BackgroundColor, uint MasterAlpha)
{
	const uint bias = 0x00800080;       // + 0.5 for the packed products
	const uint redBlueMask = 0x00FF00FF;
	const uint greenMask = 0x0000FF00;

	// Test Fa = 0. The parentheses are required: == binds tighter than &.
	if ((ForegroundColor & 0xFF000000) == 0)
		return BackgroundColor;

	// Get weight W = Fa * M; the + 1 maps the 0..255 range onto 0..256
	const uint weight = ((MasterAlpha + 1) * (ForegroundColor >> 24)) >> 8;
	if (!weight)
		return BackgroundColor;

	// P = W * F
	const uint fRedBlue = (((ForegroundColor & redBlueMask) * weight + bias) & 0xFF00FF00) >> 8;
	const uint fGreen = (((ForegroundColor & greenMask) >> 8) * weight + bias) & greenMask;

	// W = 1 - W; Q = W * B
	const uint inverseWeight = weight ^ 0x000000FF;
	const uint bRedBlue = (((BackgroundColor & redBlueMask) * inverseWeight + bias) & 0xFF00FF00) >> 8;
	const uint bGreen = (((BackgroundColor & greenMask) >> 8) * inverseWeight + bias) & greenMask;

	// Z = P + Q (assuming no overflow at each byte)
	return (fRedBlue | fGreen) + (bGreen | bRedBlue);
}

// Returns the 32-bit pixel of image at (X, Y), or 0x00000000 when the
// coordinate lies outside the image. Rows are addressed as width() * 4
// bytes apart.
inline uint safePixel(const QImage &image, const int X, const int Y)
{
	const bool inside = (X >= 0 && X < image.width() && Y >= 0 && Y < image.height());
	if (!inside)
		return 0x00000000;

	return *((uint *)(image.bits() + ((image.width() * Y + X) << 2)));
}

// Returns the 32-bit pixel at (X, Y) of the raw buffer srcBits, or
// 0x00000000 when the coordinate lies outside srcWidth x srcHeight.
// srcStride is the distance between rows, counted in pixels.
inline uint safePixel(const uchar* srcBits, const int srcStride, const int srcWidth, const int srcHeight, const int X, const int Y)
{
	const bool inside = (X >= 0 && X < srcWidth && Y >= 0 && Y < srcHeight);
	if (!inside)
		return 0x00000000;

	return *((uint *)(srcBits + ((srcStride * Y + X) << 2)));
}

// Convenience constructor: returns a FloatPoint with the given coordinates.
FloatPoint floatPoint(float X, float Y)
{
	FloatPoint point;

	point.Y = Y;
	point.X = X;

	return point;
}

// Calculates the four corner points (p1..p4) of a rectangle of the given
// width and height that is rotated "into the screen" around a vertical axis.
//
//   posX, posY    position of the axis (X) and of the top edge (Y)
//   width, height size of the unrotated rectangle
//   perspective   rotation amount. With radialMode it is an angle in degrees
//                 (period 360), otherwise a linear value with period 4 where
//                 0 is the front view, 1 is edge-on and 2 the mirrored view.
//                 Values outside one period are wrapped.
//   depth         strength of the vertical foreshortening
//   axisPos       relative position of the rotation axis within the width
//                 (0 = left edge, 1 = right edge)
//   p1..p4        receive the corners in the order top-left, top-right,
//                 bottom-right, bottom-left of the projected quad
void ImageFX::calculatePerspectivePoints(float posX, float posY, float width, float height, float perspective, float depth, float axisPos, bool radialMode, FloatPoint &p1, FloatPoint &p2, FloatPoint &p3, FloatPoint &p4)
{
	float persWidth, persHeight;
	float directionSwap, mirrorSwap;
	float scale;
	float leftPart, rightPart, topPart, bottomPart;

	// index 0: linear mode, index 1: radial mode
	static const float validRange[2] = {4, 360};
	static const float mirrorRange[2] = {2, 180};
	const int mode = radialMode ? 1 : 0;

	// Wrap perspective into one period [0, validRange).
	// This has to be a floating point division: the integer expressions
	// 1 / 4 and 1 / 360 evaluate to 0, which disables the wrap-around.
	perspective = perspective - floorf(perspective / validRange[mode]) * validRange[mode];

	if (fabs(perspective) > mirrorRange[mode])
		mirrorSwap = -1;
	else
		mirrorSwap = 1;

	if (perspective >= 0)
		directionSwap = 1;
	else
		directionSwap = -1;

	if (!radialMode)
	{
		perspective = fabs(perspective);

		// triangle wave: 1 at 0, -1 at 2, back to 1 at 4
		if (perspective <= 2)
			scale = 1 - perspective;
		else
			scale = perspective - 3;
	}
	else
		scale = cosf(perspective * (M_PI / 180));

	persWidth = width * scale;
	persHeight = directionSwap * mirrorSwap * height * depth * (1 - fabs(scale));

	// split the projected extents at the rotation axis
	leftPart = -persWidth * axisPos;
	topPart = -persHeight * axisPos;

	axisPos = 1 - axisPos;
	rightPart = persWidth * axisPos;
	bottomPart = persHeight * axisPos;

	p1 = floatPoint(posX + leftPart, posY + topPart);
	p2 = floatPoint(posX + rightPart, posY + bottomPart);
	p3 = floatPoint(posX + rightPart, posY + height - bottomPart);
	p4 = floatPoint(posX + leftPart, posY + height - topPart);
}

// Builds the inverse transformation matrix used to map destination pixels
// back into source image space, and the bounding rectangle of the
// destination quadrilateral.
//
//   srcWidth, srcHeight  size of the source image in pixels
//   p1..p4               corner points of the destination quadrilateral
//                        (presumably in the order produced by
//                        calculatePerspectivePoints - confirm with callers)
//   boundsRect           receives the bounding box of p1..p4
//   inverse*Matrix       receives the inverted matrix, as floats when
//                        USE_FLOAT is defined and as fixed point otherwise
//
// If the quadrilateral is degenerate the forward matrix is all zero, which
// invertFloatMatrix turns into the identity matrix.
// NOTE(review): a source dimension of 1 makes (srcWidth - 1) or
// (srcHeight - 1) zero and the scale factors below infinite.
#ifdef USE_FLOAT
void prepareProjectiveTransformationMatrix(float srcWidth, float srcHeight,
	const FloatPoint &p1, const FloatPoint &p2, const FloatPoint &p3, const FloatPoint &p4,
	QRect &boundsRect, FloatMatrix &inverseFloatMatrix)
#else
void prepareProjectiveTransformationMatrix(float srcWidth, float srcHeight,
	const FloatPoint &p1, const FloatPoint &p2, const FloatPoint &p3, const FloatPoint &p4,
	QRect &boundsRect, FixedMatrix &inverseFixedMatrix)
#endif
{
	float dx1, dx2, px, dy1, dy2, py;
	float g, h, k;
	FloatMatrix R;
	FloatMatrix matrix, inverseMatrix;

	float Wx0, Wx1, Wx2, Wx3;
	float Wy0, Wy1, Wy2, Wy3;

	Wx0 = p1.X; Wy0 = p1.Y;
	Wx1 = p2.X; Wy1 = p2.Y;
	Wx2 = p3.X; Wy2 = p3.Y;
	Wx3 = p4.X; Wy3 = p4.Y;

	// px and py are both zero exactly when the four points form a
	// parallelogram, in which case no perspective terms are needed
	px = Wx0 - Wx1 + Wx2 - Wx3;
	py = Wy0 - Wy1 + Wy2 - Wy3;

	if (px == 0 && py == 0)
	{
		// affine mapping
		matrix[0][0] = Wx1 - Wx0;
		matrix[1][0] = Wx2 - Wx1;
		matrix[2][0] = Wx0;

		matrix[0][1] = Wy1 - Wy0;
		matrix[1][1] = Wy2 - Wy1;
		matrix[2][1] = Wy0;

		matrix[0][2] = 0;
		matrix[1][2] = 0;
		matrix[2][2] = 1;
	}
	else
	{
		//projective mapping
	    dx1 = Wx1 - Wx2;
		dx2 = Wx3 - Wx2;
		dy1 = Wy1 - Wy2;
		dy2 = Wy3 - Wy2;
		k = dx1 * dy2 - dx2 * dy1;

		if (k != 0)
		{
			// g and h are the perspective terms of the matrix
			g = (px * dy2 - py * dx2) / k;
			h = (dx1 * py - dy1 * px) / k;

			matrix[0][0] = Wx1 - Wx0 + g * Wx1;
			matrix[1][0] = Wx3 - Wx0 + h * Wx3;
			matrix[2][0] = Wx0;

			matrix[0][1] = Wy1 - Wy0 + g * Wy1;
			matrix[1][1] = Wy3 - Wy0 + h * Wy3;
			matrix[2][1] = Wy0;

			matrix[0][2] = g;
			matrix[1][2] = h;
			matrix[2][2] = 1;
		}
		else
			// degenerate quadrilateral: no valid mapping exists
			copyFloatMatrix(emptyMatrix, matrix);
	}

	// denormalize texture space (u, v)
	copyFloatMatrix(identityMatrix, R);
	R[0][0] = 1.0 / (srcWidth - 1);  //(SrcRect.Right - SrcRect.Left);
	R[1][1] = 1.0 / (srcHeight - 1); //(SrcRect.Bottom - SrcRect.Top);
	multiplyFloatMatrices(matrix, R, matrix);

	// translation by the source rectangle origin; the origin is fixed at
	// (0, 0) here, so R stays the identity matrix
	copyFloatMatrix(identityMatrix, R);
	R[2][0] = 0; //-SrcRect.Left;
	R[2][1] = 0; //-SrcRect.Top;
	multiplyFloatMatrices(matrix, R, matrix);

	copyFloatMatrix(matrix, inverseMatrix);
	invertFloatMatrix(inverseMatrix);

#ifdef USE_FLOAT
	copyFloatMatrix(inverseMatrix, inverseFloatMatrix);
#else
	floatToFixedMatrix(inverseMatrix, inverseFixedMatrix);
#endif

	// bounding box of the four corner points; the +/- 0.5 is applied before
	// the conversion to int, which truncates towards zero
	boundsRect.setLeft((int)(min(min(Wx0, Wx1), min(Wx2, Wx3)) - 0.5));
	boundsRect.setRight((int)(max(max(Wx0, Wx1), max(Wx2, Wx3)) + 0.5));
	boundsRect.setTop((int)(min(min(Wy0, Wy1), min(Wy2, Wy3)) - 0.5));
	boundsRect.setBottom((int)(max(max(Wy0, Wy1), max(Wy2, Wy3)) + 0.5));
	//qDebug("boundsRect (%d, %d, %d, %d)", boundsRect.left(), boundsRect.top(), boundsRect.right(), boundsRect.bottom());


	//qDebug("  %d  %d  %d", inverseFixedMatrix[0][0], inverseFixedMatrix[0][1], inverseFixedMatrix[0][2]);
	//qDebug("  %d  %d  %d", inverseFixedMatrix[1][0], inverseFixedMatrix[1][1], inverseFixedMatrix[1][2]);
	//qDebug("  %d  %d  %d", inverseFixedMatrix[2][0], inverseFixedMatrix[2][1], inverseFixedMatrix[2][2]);
}

// Evaluates a lookup table with linear interpolation between entries.
// The table holds one sample every (1 << shift) units of x: the upper bits
// of x select the table cell, the lower shift bits are the position inside
// the cell. Reads lut[x >> shift] and the entry following it.
static inline int lutLerp32(const int x, const int lut[], const int shift)
{
    const int cell = x >> shift;
    const int *entry = lut + cell;

    const int lower = entry[0];
    const int rise = entry[1] - lower;
    const int offset = x - (cell << shift);

    return lower + ((rise * offset) >> shift);
}

#ifdef USE_LUTLERP
// Determines the range of z values that occur inside the destination
// rectangle by evaluating z at its four corners.
//
//   minZ   receives the smallest z value found
//   shift  receives the number of bits the z range has to be shifted right
//          so that it spans at most 128 table cells
//
// Returns the reduced range, i.e. (maxZ - minZ) >> shift.
static int getLUTRange(int &minZ, int &shift, const FixedMatrix &inverseFixedMatrix,
	const int dstLeft, const int dstTop, const int dstRight, const int dstBottom)
{
	const int cornerX[4] = { dstLeft, dstRight, dstLeft,   dstRight  };
	const int cornerY[4] = { dstTop,  dstTop,   dstBottom, dstBottom };

	int maxZ = INT_MIN;
	minZ = INT_MAX;

	for (int corner = 0; corner < 4; ++corner)
	{
		const int Z = inverseFixedMatrix[0][2] * cornerX[corner] + inverseFixedMatrix[1][2] * cornerY[corner] + inverseFixedMatrix[2][2];
		minZ = QMIN(Z, minZ);
		maxZ = QMAX(Z, maxZ);
	}

	const int fullRange = maxZ - minZ;

	// drop precision until the range fits into the table
	for (shift = 0; (fullRange >> shift) > 128; ++shift)
	{
	}

	return fullRange >> shift;
}
#endif

// Pixel function that overwrites the destination pixel with the source
// pixel. The user data pointer is not used.
class StraightPixelCopy
{
public:
	static inline void setPixel(uint *p1, const uint p2, void *)
	{
		p1[0] = p2;
	}
};

// Pixel function that passes the red, green and blue channel of the source
// pixel through a lookup table before writing the destination pixel.
// data must point to a table of uint indexed by channel value (presumably
// 256 entries - confirm with callers). The alpha of the source pixel is not
// used; the result is composed with qRgb().
class LUTPixelCopy
{
public:
	static inline void setPixel(uint *p1, const uint p2, void *data)
	{
		// no 'register' here: the storage class was removed in C++17
		const uint *lut = (const uint *)data;
		*p1 = qRgb(lut[qRed(p2)], lut[qGreen(p2)], lut[qBlue(p2)]);
	}
};

class BlendPixel
{
public:
	static inline void setPixel(uint *p1, const uint p2, void *)
	{
		*p1 = blendARGB(p2, *p1);
	}
};

template <class PixelFunction>
class ProjectiveImageTransformationNearestPixelProcessor
{
public:
	static inline void process(uint* dst,
#ifdef USE_FLOAT
		float srcX, float srcY,
#else
		int srcX, int srcY,
#endif
		const int srcLeft, const int srcTop, const int srcRight, const int srcBottom,
		const uchar *srcBits, const int srcStride, void *data)
	{
/*
#ifdef USE_FLOAT
		register int srcX = (int)fsrcX;
		register int srcY = (int)fsrcY;
#endif
*/

		if ((srcX >= srcLeft) && (srcX <= srcRight) && (srcY >= srcTop) && (srcY <= srcBottom))
			PixelFunction::setPixel(dst, *((uint *)(srcBits + ((srcStride * (int)srcY + (int)srcX) << 2))), data);
			//PixelFunction::setPixel(((uint *)(srcBits))[srcStride * srcY + srcX], dst);
	}
};

template <class PixelFunction>
class ProjectiveImageTransformationBilinearPixelProcessor
{
public:
	static inline void process(uint* dst,
#ifdef USE_FLOAT
		float fsrcX, float fsrcY,
#else
		int srcX, int srcY,
#endif
		const int srcLeft, const int srcTop, const int srcRight, const int srcBottom,
		const uchar *srcBits, const int srcStride, void *data)
	{
		uint celx, cely, c1, c2, c3, c4;
		uint *P;

#ifdef USE_FLOAT
		register int srcX = (int)fsrcX;
		register int srcY = (int)fsrcY;

		celx = ((int)(fsrcX * 256) & 0xFF) ^ 0xFF;
		cely = ((int)(fsrcY * 256) & 0xFF) ^ 0xFF;
#else
		celx = (srcX & 0xFF) ^ 0xFF;
		cely = (srcY & 0xFF) ^ 0xFF;

		srcX >>= 8;
		srcY >>= 8;
#endif

		if ((srcX > srcLeft) && (srcX < srcRight - 1) && (srcY > srcTop) && (srcY < srcBottom - 1))
		{
			P = (uint *)(srcBits + ((srcStride * srcY + srcX) << 2));
			c1 = *P++;
			c2 = *P;
			P += srcStride;
			c4 = *P--;
			c3 = *P;
			PixelFunction::setPixel(dst, interpolateARGB(interpolateARGB(c1, c2, celx), interpolateARGB(c3, c4, celx), cely), data);
		}
		else if ((srcX < srcLeft - 1) || (srcY < srcTop - 1) || (srcX > srcRight) || (srcY > srcBottom))
		{
		      // (X,Y) coordinate is out of the SrcRect, do not interpolate
		}
		else
		{
			int srcWidth = srcRight + 1;
			int srcHeight = srcBottom + 1;
			c1 = safePixel(srcBits, srcStride, srcWidth, srcHeight, srcX,     srcY);
			c2 = safePixel(srcBits, srcStride, srcWidth, srcHeight, srcX + 1, srcY);
			c3 = safePixel(srcBits, srcStride, srcWidth, srcHeight, srcX,     srcY + 1);
			c4 = safePixel(srcBits, srcStride, srcWidth, srcHeight, srcX + 1, srcY + 1);
			PixelFunction::setPixel(dst, interpolateARGB(interpolateARGB(c1, c2, celx), interpolateARGB(c3, c4, celx), cely), data);
		}
	}
};

// Performs the perspective divide: turns the homogeneous source coordinate
// (srcX, srcY, z) into a source image coordinate, in place.
//
//   fixedPrec  number of fractional bits of the fixed point input
//   precRedux  number of fractional bits to keep in the result
//
// Three variants exist, selected at compile time:
//   USE_LUTLERP  reciprocal of z from an interpolated lookup table
//   USE_FLOAT    floating point division
//   otherwise    fixed point division
// The two non-LUT variants cache the last reciprocal in prevZ / prevZL.
template <int fixedPrec, int precRedux = 0>
class ProjectiveImageTransformationSourceCoordinateProcessor
{
public:
#ifdef USE_LUTLERP
	static inline void process(int &srcX, int &srcY, const int z, const int minZ, const int lut[], const int shift)
	{
		const int reciprocal = lutLerp32(z - minZ, lut, shift);

		srcX = ((srcX >> LutLerpPrec) * reciprocal) >> (fixedPrec - precRedux);
		srcY = ((srcY >> LutLerpPrec) * reciprocal) >> (fixedPrec - precRedux);
	}
#else
#ifdef USE_FLOAT
	static inline void process(float &srcX, float &srcY, const float z, float &prevZ, float &prevZL)
	{
		// nothing to divide by (z == 0) or nothing to do (z == 1)
		if (z == 0 || z == 1)
			return;

		// refresh the cached reciprocal only when z changed
		if (z != prevZ)
		{
			prevZL = 1 / z;
			prevZ = z;
		}

		srcX = srcX * prevZL;
		srcY = srcY * prevZL;
	}
#else
	static inline void process(int &srcX, int &srcY, const int z, int &prevZ, int &prevZL)
	{
		if (z == 0)
			return;

		if (z == FixedOne)
		{
			// z is 1.0: only the precision has to be reduced
			srcX = srcX >> (fixedPrec - precRedux);
			srcY = srcY >> (fixedPrec - precRedux);
			return;
		}

		// refresh the cached reciprocal only when z changed
		if (z != prevZ)
		{
			prevZ = z;
			prevZL = (FixedOne << LutLerpPrec) / z;
		}

		srcX = ((srcX >> LutLerpPrec) * prevZL) >> (fixedPrec - precRedux);
		srcY = ((srcY >> LutLerpPrec) * prevZL) >> (fixedPrec - precRedux);
	}
#endif
#endif
};

// Handles one destination pixel: transforms (dstX, dstY) back into source
// image space with inverseMatrix, performs the perspective divide and lets
// PixelProcessor fetch the source pixel(s) and write the result to dst.
// The remaining parameters are passed through to the coordinate processor
// (reciprocal table or reciprocal cache) and to the pixel processor
// (source rectangle, source bits, row distance in pixels, user data).
template <template <class PixelFunction> class PixelProcessor, class PixelFunction, int fixedPrec, int precRedux = 0>
class ProjectiveImageTransformationPixelTransformer
{
public:
	static inline void process(uint* dst,
#ifdef USE_FLOAT
		const int dstX, const int dstY, const FloatMatrix &inverseMatrix,
#else
		const int dstX, const int dstY, const FixedMatrix &inverseMatrix,
#endif
#ifdef USE_LUTLERP
		const int minZ, const int lut[], const int shift,
#else
#ifdef USE_FLOAT
		float &prevZ, float &prevZL,
#else
		int &prevZ, int &prevZL,
#endif
#endif
		const int srcLeft, const int srcTop, const int srcRight, const int srcBottom,
		const uchar *srcBits, const int srcStride, void *data)
	{
		// reverse transform destination coordinate to srcImage space:
		// (u, v, w) = (dstX, dstY, 1) * inverseMatrix
#ifdef USE_FLOAT
		float u = inverseMatrix[0][0] * dstX + inverseMatrix[1][0] * dstY + inverseMatrix[2][0];
		float v = inverseMatrix[0][1] * dstX + inverseMatrix[1][1] * dstY + inverseMatrix[2][1];
		float w = inverseMatrix[0][2] * dstX + inverseMatrix[1][2] * dstY + inverseMatrix[2][2];
#else
		int u = inverseMatrix[0][0] * dstX + inverseMatrix[1][0] * dstY + inverseMatrix[2][0];
		int v = inverseMatrix[0][1] * dstX + inverseMatrix[1][1] * dstY + inverseMatrix[2][1];
		int w = inverseMatrix[0][2] * dstX + inverseMatrix[1][2] * dstY + inverseMatrix[2][2];
#endif

		// perspective divide (modifies u and v in place)
#ifdef USE_LUTLERP
		ProjectiveImageTransformationSourceCoordinateProcessor<fixedPrec, precRedux>::
			process(u, v, w, minZ, lut, shift);
#else
		ProjectiveImageTransformationSourceCoordinateProcessor<fixedPrec, precRedux>::
			process(u, v, w, prevZ, prevZL);
#endif

		PixelProcessor<PixelFunction>::
			process(dst, u, v, srcLeft, srcTop, srcRight, srcBottom, srcBits, srcStride, data);
	}
};

template <template <class PixelFunction> class PixelProcessor, class PixelFunction, int precRedux = 0>
class ProjectiveImageTransformationQImage
{
public:
#ifdef USE_FLOAT
	static inline void process(const QImage &srcImage, QImage &dstImage, const FloatMatrix &inverseMatrix, const QRect &dstRect, void *data)
#else
	static inline void process(const QImage &srcImage, QImage &dstImage, const FixedMatrix &inverseMatrix, const QRect &dstRect, void *data)
#endif
	{
		int srcTop = 0;
		int srcLeft = 0;
		int srcBottom = srcImage.height() - 1;
		int srcRight = srcImage.width() - 1;

		int dstTop = dstRect.top();
		int dstLeft = dstRect.left();
		int dstBottom = dstRect.bottom();
		int dstRight = dstRect.right();

#ifdef USE_LUTLERP
		int minZ = 0;
		int shift = 0;
		int range = getLUTRange(minZ, shift, inverseMatrix, dstLeft, dstTop, dstRight, dstBottom);
		int lut[range + 2];
		for (int x = 0; x < range + 2; ++x)
		{
			int div = minZ + (x << shift);
			if (!div) div = 1;
			lut[x] = (FixedOne << LutLerpPrec) / div;
		}
#else
#ifdef USE_FLOAT
		float prevZ = 0;
		float prevZL = 0;
#else
		int prevZ = 0;
		int prevZL = 0;
#endif
#endif

		uchar *srcBits = const_cast<QImage &>(srcImage).bits();
		int srcStride = srcImage.bytesPerLine() / 4;

		uchar *dstBits = dstImage.bits();
		int dstStride = dstImage.bytesPerLine() / 4;

		for (int dstY = dstTop; dstY <= dstBottom; ++dstY)
		{
			uint *dst = (uint *)(dstBits + ((dstStride * dstY + dstLeft) << 2));
			for (int dstX = dstLeft; dstX <= dstRight; ++dstX)
			{
				ProjectiveImageTransformationPixelTransformer<PixelProcessor, PixelFunction, FixedPrec, precRedux>::
					process(
						dst, dstX, dstY, inverseMatrix,
#ifdef USE_LUTLERP
						minZ, lut, shift,
#else
						prevZ, prevZL,
#endif
						srcLeft, srcTop, srcRight, srcBottom, srcBits, srcStride,
						data
					);

				++dst;
			}
		}
	}
};


void ImageFX::projectiveTransformation(const QImage &srcImage, QImage &dstImage, const QRect &dstClip,
	const FloatPoint &p1, const FloatPoint &p2, const FloatPoint &p3, const FloatPoint &p4, bool antialias, bool blend)
{
	QRect boundsRect;

#ifdef USE_FLOAT
	FloatMatrix inverseMatrix;
	prepareProjectiveTransformationMatrix(srcImage.width(), srcImage.height(), p1, p2, p3, p4, boundsRect, inverseMatrix);
#else
	FixedMatrix inverseMatrix;
	prepareProjectiveTransformationMatrix(srcImage.width(), srcImage.height(), p1, p2, p3, p4, boundsRect, inverseMatrix);
#endif

	//QRect DstRect = QRect(QPoint(0, 0), QPoint(dstImage.width() - 1, dstImage.height() - 1));

	QRect dstRect = dstClip;
	//qDebug("dstClip ^ dstRect (%d, %d, %d, %d)", dstClip.left(), dstClip.top(), dstClip.right(), dstClip.bottom());

	dstRect = dstRect.intersect(dstImage.rect());
	//qDebug("ImageBounds ^ dstRect (%d, %d, %d, %d)", DstRect.left(), DstRect.top(), DstRect.right(), DstRect.bottom());

	dstRect = dstRect.intersect(boundsRect);
	//qDebug("BoundsRect ^ dstRect (%d, %d, %d, %d)", DstRect.left(), DstRect.top(), DstRect.right(), DstRect.bottom());

	if (blend)
	{
		if (!antialias)
			ProjectiveImageTransformationQImage<ProjectiveImageTransformationNearestPixelProcessor, BlendPixel>::
				process(srcImage, dstImage, inverseMatrix, dstRect, NULL);
		else
			ProjectiveImageTransformationQImage<ProjectiveImageTransformationBilinearPixelProcessor, BlendPixel, 8>::
				process(srcImage, dstImage, inverseMatrix, dstRect, NULL);
	}
	else
	{
		if (!antialias)
			ProjectiveImageTransformationQImage<ProjectiveImageTransformationNearestPixelProcessor, StraightPixelCopy>::
				process(srcImage, dstImage, inverseMatrix, dstRect, NULL);
		else
			ProjectiveImageTransformationQImage<ProjectiveImageTransformationBilinearPixelProcessor, StraightPixelCopy, 8>::
				process(srcImage, dstImage, inverseMatrix, dstRect, NULL);
	}
}


// Draws srcImage into dstImage, projected onto the quadrilateral p1..p4,
// passing the colour channels of every written pixel through lut.
//
//   dstClip    clipping rectangle in destination coordinates; it is further
//              restricted to dstImage and to the bounds of the quadrilateral
//   antialias  true selects bilinear filtering, false nearest neighbour
//   lut        lookup table handed to LUTPixelCopy as user data
void ImageFX::projectiveTransformationLUT(const QImage &srcImage, QImage &dstImage, const QRect &dstClip,
	const FloatPoint &p1, const FloatPoint &p2, const FloatPoint &p3, const FloatPoint &p4, bool antialias, uint *lut)
{
	QRect boundsRect;

#ifdef USE_FLOAT
	FloatMatrix inverseMatrix;
	prepareProjectiveTransformationMatrix(srcImage.width(), srcImage.height(), p1, p2, p3, p4, boundsRect, inverseMatrix);
#else
	FixedMatrix inverseMatrix;
	prepareProjectiveTransformationMatrix(srcImage.width(), srcImage.height(), p1, p2, p3, p4, boundsRect, inverseMatrix);
#endif

	// clip against the destination image and the projected quadrilateral
	const QRect dstRect = dstClip.intersect(dstImage.rect()).intersect(boundsRect);

	if (antialias)
	{
		ProjectiveImageTransformationQImage<ProjectiveImageTransformationBilinearPixelProcessor, LUTPixelCopy, 8>::
			process(srcImage, dstImage, inverseMatrix, dstRect, lut);
	}
	else
	{
		ProjectiveImageTransformationQImage<ProjectiveImageTransformationNearestPixelProcessor, LUTPixelCopy>::
			process(srcImage, dstImage, inverseMatrix, dstRect, lut);
	}
}
