/*
 * playlistaspectsearchfilter.cpp
 *
 *  Created on: 18.04.2010
 *      Author: darkstar
 */

#include <qregexp.h>
#ifdef QTOPIA
#include <qtl.h>
#endif

#include "playlistaspectsearchfilter.h"
#include "sqlite3.h"
#include "debug.h"
#include "compathack.h"

// Number of characters per n-gram. Used when building the index and as the
// minimum filter text length for which the n-gram search is chosen.
#define cGramSize 3
// Percentage of a search token's n-grams that have to be found for a media
// entry to count as a match in the n-gram search.
#define cGramMinimalMatchThresholdPercentage 80

// One index manager per media database, shared by all filters attached to
// that database. Entries are added in attachedToAspect() and removed (and
// the manager deleted) in removedFromAspect() once no filter uses it anymore.
static QMap<MediaDatabase *, MediaNGramIndexManager *> indexManagers;

/* PlayListAspectSearchFilter */

// Creates a filter with an empty filter text, no filter text applied to the
// database yet (dbFilterText_ is null) and the n-gram search enabled.
PlayListAspectSearchFilter::PlayListAspectSearchFilter()
	:	QObject(),
	 	PlayListAspectExtension(),
		filterText_(""),
		dbFilterText_(QString::null),
		nGramSearchEnabled_(true)
{
}

// Nothing to release here; the shared index manager is handled in
// removedFromAspect().
PlayListAspectSearchFilter::~PlayListAspectSearchFilter()
{
}

void PlayListAspectSearchFilter::attachedToAspect()
{
	MediaDatabase *mediaDatabase = playListAspect()->playList()->mediaDatabase();

	if (!indexManagers.contains(mediaDatabase))
	{
		MediaNGramIndexManager *manager = new MediaNGramIndexManager(mediaDatabase);
		manager->attach(this);
		indexManagers[mediaDatabase] = manager;
	}
	else
	{
		MediaNGramIndexManager *manager = indexManagers[mediaDatabase];
		manager->attach(this);
	}
}

// Unregisters this filter from the index manager of the play list's media
// database. When the last filter is gone the manager is removed from
// indexManagers and deleted.
void PlayListAspectSearchFilter::removedFromAspect()
{
	MediaDatabase *mediaDatabase = playListAspect()->playList()->mediaDatabase();

	// operator[] would silently insert a NULL entry for an unknown database
	// and the following call would dereference it, so check first.
	if (!indexManagers.contains(mediaDatabase))
		return;

	MediaNGramIndexManager *manager = indexManagers[mediaDatabase];

	if (!manager)
	{
		// Drop a stale NULL entry instead of dereferencing it.
		indexManagers.remove(mediaDatabase);
		return;
	}

	manager->remove(this);

	if (manager->filterCount() == 0)
	{
		indexManagers.remove(mediaDatabase);
		delete manager;
	}
}

// Stores a new filter text and triggers update(). Setting the text that is
// already active does nothing.
void PlayListAspectSearchFilter::setFilterText(const QString &filter)
{
	if (filter == filterText_)
		return;

	filterText_ = filter;
	update();
}

// Name of the view this filter produces: the aspect's prefix followed by
// "textfilter".
const QString PlayListAspectSearchFilter::outputViewSource()
{
	QString viewName = playListAspect()->prefix();
	viewName += "textfilter";
	return viewName;
}

// Clears the filter: the current filter text becomes empty and the text
// remembered as applied to the database becomes null, so the next execute()
// rebuilds the view.
void PlayListAspectSearchFilter::reset()
{
	filterText_ = "";
	dbFilterText_ = QString::null;
}

void PlayListAspectSearchFilter::execute()
{
	DENTERMETHOD;

	QString inputView = inputViewSource();
	QString outputView = outputViewSource();

	if (dbFilterText_.isEmpty() || dbFilterText_ != filterText_)
	{
		if (nGramSearchEnabled_ && filterText_.length() >= cGramSize)
			nGramTextSearch(inputViewSource(), outputViewSource(), filterText_);
		else
			naiveTextSearch(inputViewSource(), outputViewSource(), filterText_);

		dbFilterText_ = filterText_;
	}

	DEXITMETHOD;
}

void PlayListAspectSearchFilter::naiveTextSearch(const QString &inputView, const QString &outputView, const QString &searchString)
{
	QString query =
		"DROP VIEW IF EXISTS " + outputView + "; "
		"CREATE TEMPORARY VIEW " + outputView + " AS "
		"SELECT " + inputView + ".playlist_id as playlist_id, " + inputView + ".media_id as media_id, idx, track, title, album, artist, genre, playtime, filesize "
		"FROM " + inputView + ", media "
		"WHERE " + inputView + ".media_id = media.media_id ";

	if (!searchString.isEmpty())
	{
		QString conditionString = "";
		QString safeLikeTokenString = "";

		// split input filter string into parts so we can build separate conditions for each...
		QStringList tokenList = MediaNGramIndexManager::tokenizeString(searchString);
		MediaNGramIndexManager::reorderTokensByLength(tokenList);

		int tokenCount = tokenList.count();

		// build condition string for view...
		for (int i = 0; i < tokenCount; ++i)
		{
			safeLikeTokenString = dbSafeString("%" + tokenList[i] + "%");

			//conditionString += "(title LIKE '" + safeLikeTokenString + "' OR artist LIKE '" + safeLikeTokenString + "' OR album LIKE '" + safeLikeTokenString + "')";

			// the following is faster but less accurate:
			conditionString += "(title || album || artist LIKE '" + safeLikeTokenString + "')";

			// only append AND if this is not the last search string
			if (i < tokenCount -1)
				conditionString += " AND ";
		}

		query += "AND (" + conditionString + ") ";
	}

	query += ";";

	DTIMERINIT(timer);
	sqlite3_exec(playListAspect()->playList()->mediaDatabase()->db(), query.toUtf8(), NULL, NULL, NULL);
	DOP(playListAspect()->playList()->mediaDatabase()->dbDebug(query));

	// the schema changed, so we need to finalize and re-prepare statements...
	playListAspect()->playList()->mediaDatabase()->ensurePreparedStatementsAreValid();

	DTIMERPRINT(timer, "query");
}

void PlayListAspectSearchFilter::nGramTextSearch(const QString &inputView, const QString &outputView, const QString &searchString)
{
	QString query =
		"DROP VIEW IF EXISTS " + outputView + "; "
		"CREATE TEMPORARY VIEW " + outputView + " AS ";

	if (searchString.isEmpty())
	{
		query +=
			"SELECT " + inputView + ".playlist_id as playlist_id, " + inputView + ".media_id as media_id, idx, track, title, album, artist, genre, playtime, filesize "
			"FROM " + inputView + ", media "
			"WHERE " + inputView + ".media_id = media.media_id ";
	}
	else
	{
		QString token;
		QString source = inputView;
		QString temp;
		QString nGramSet;
		QStringList nGrams;
		int nGramMatchMinCount;
		bool isLast;

		QStringList tokenList =	MediaNGramIndexManager::tokenizeString(
			MediaNGramIndexManager::removeDiacritics(
				MediaNGramIndexManager::removePunctuations(searchString)
			)
		);

		MediaNGramIndexManager::reorderTokensByLength(tokenList);

		int tokenCount = tokenList.count();

		// build condition string for view...
		for (int i = 0; i < tokenCount; ++i)
		{
			isLast = i == tokenCount - 1;

			nGrams.clear();
			token = dbSafeString(tokenList[i].lower());
			DPRINTF("token: %s", (const char*)token.toUtf8());
			MediaNGramIndexManager::createNGrams(token, nGrams, cGramSize, token.length() < cGramSize, false);
			nGramSet = "'" + nGrams.join("','") + "'";
			nGramMatchMinCount = qMax(1, (int)(((float)cGramMinimalMatchThresholdPercentage / 100) * nGrams.count()));

			temp =
				"SELECT"
				" " + inputView + ".playlist_id AS playlist_id,"
				" " + inputView + ".media_id AS media_id,"
				" " + inputView + ".idx AS idx ";

			if (isLast)
				temp +=	", track, title, album, artist, genre, playtime, filesize ";

			temp += "FROM " + source + ",";

			if (isLast)
				temp += "media,";

			temp +=
				" token_index,"
				" ngram_index "
				"WHERE ";

			if (isLast)
				temp += " media.media_id = " + inputView + ".media_id AND";

			temp +=
				" token_index.media_id = " + inputView + ".media_id AND"
				" ngram_index.token_id = token_index.token_id AND"
				" ngram_index.ngram IN (" + nGramSet + ") "
				"GROUP BY"
				" token_index.media_id "
				"HAVING"
				" COUNT(ngram_index.token_id) >= " + QString::number(nGramMatchMinCount);

			if (!isLast)
				source = "(" + temp + ") as " + inputView;
		}

		query += temp;
	}

	query += ";";

	DTIMERINIT(timer);
	sqlite3_exec(playListAspect()->playList()->mediaDatabase()->db(), query.toUtf8(), NULL, NULL, NULL);
	DOP(playListAspect()->playList()->mediaDatabase()->dbDebug(query));

	// the schema changed, so we need to finalize and re-prepare statements...
	playListAspect()->playList()->mediaDatabase()->ensurePreparedStatementsAreValid();

	DTIMERPRINT(timer, "query");
}

// Serialises the extension state: a single entry holding the filter text is
// appended to `dst`.
void PlayListAspectSearchFilter::saveExtensionState(QStringList &dst)
{
	dst << filterText_;
}

// Restores the extension state. `src` is only accepted when it holds exactly
// one entry, which is then used as the filter text.
void PlayListAspectSearchFilter::loadExtensionState(QStringList &src)
{
	if (src.count() != 1)
		return;

	setFilterText(src.first());
}

/* MediaNGramIndexManager */

// Creates the index manager for `mediaDatabase`.
//
// The insert statements start out as NULL; they are created in
// mediaDatabasePreparingStatements() and released in
// mediaDatabaseFinalizingStatements(), which are connected to the database's
// corresponding signals here. If the index tables cannot be queried the index
// is built from scratch.
MediaNGramIndexManager::MediaNGramIndexManager(MediaDatabase *mediaDatabase)
	:	QObject(),
		mediaDatabase_(mediaDatabase),
		stmtInsertNGramIndex_(NULL),
	 	stmtInsertTokenIndex_(NULL)
{
	// Follow the database's prepare/finalize cycle for our own statements.
	connect(mediaDatabase_, SIGNAL(preparingStatements(sqlite3*)), this, SLOT(mediaDatabasePreparingStatements(sqlite3*)));
	connect(mediaDatabase_, SIGNAL(finalizingStatements(sqlite3*)), this, SLOT(mediaDatabaseFinalizingStatements(sqlite3*)));

	// Re-index a media entry whenever the database reports it as updated.
	connect(mediaDatabase_, SIGNAL(mediaUpdated(Media *)), this, SLOT(mediaDatabaseMediaUpdated(Media *)));

	if (!nGramIndexExists())
		recreateNGramIndex(cGramSize);
}

// Detaches from the database's statement signals and releases the prepared
// insert statements that are still held, so they do not outlive the manager.
MediaNGramIndexManager::~MediaNGramIndexManager()
{
	disconnect(mediaDatabase_, SIGNAL(finalizingStatements(sqlite3*)), this, SLOT(mediaDatabaseFinalizingStatements(sqlite3*)));
	disconnect(mediaDatabase_, SIGNAL(preparingStatements(sqlite3*)), this, SLOT(mediaDatabasePreparingStatements(sqlite3*)));

	// After the disconnect nobody would finalize these anymore.
	if (stmtInsertNGramIndex_)
	{
		sqlite3_finalize(stmtInsertNGramIndex_);
		stmtInsertNGramIndex_ = NULL;
	}

	if (stmtInsertTokenIndex_)
	{
		sqlite3_finalize(stmtInsertTokenIndex_);
		stmtInsertTokenIndex_ = NULL;
	}
}

// Adds `filter` to the list of filters using this manager.
void MediaNGramIndexManager::attach(PlayListAspectSearchFilter *filter)
{
	list_.append(filter);
}

// Removes `filter` from the list of filters using this manager.
void MediaNGramIndexManager::remove(PlayListAspectSearchFilter *filter)
{
	list_.remove(filter);
}

// Returns `input` with diacritics removed.
//
// A handful of Latin-1 umlauts and ligatures are transliterated to two ASCII
// letters (for example U+00E4 -> "ae"). Every other character is replaced by
// the first character of its Unicode decomposition if it has one and is kept
// unchanged otherwise.
QString MediaNGramIndexManager::removeDiacritics(const QString &input)
{
	QString output = "";

#ifndef QTOPIA
	output.reserve(input.length() * 2); // Reserve for better performance...
#endif

	const int inputLength = input.length();

	for (int i = 0; i < inputLength; ++i)
	{
		const QChar ch = input.at(i);

		// Rewrite some important umlauts and graphemes into plain letters
		switch (ch.unicode())
		{
			case 0xC4: // A with diaeresis
				output += "Ae";
				break;
			case 0xC6: // ligature AE
				output += "AE";
				break;
			case 0xD6: // O with diaeresis
			case 0xD8: // O with stroke
				output += "Oe";
				break;
			case 0xDC: // U with diaeresis
				output += "Ue";
				break;
			case 0xDF: // sharp s
				output += "ss";
				break;
			case 0xE4: // a with diaeresis
			case 0xE6: // ligature ae
				output += "ae";
				break;
			case 0xF6: // o with diaeresis
			case 0xF8: // o with stroke
				output += "oe";
				break;
			case 0xFC: // u with diaeresis
				output += "ue";
				break;
			default:
			{
				// For everything else use the first character of the decomposed form
				// if there exists a decomposition of the character. If not, just use
				// the original character. This should be safe for all alphabets.
				//
				// isEmpty() rather than isNull(): an empty but non-null
				// decomposition must not be indexed with at(0).
				const QString decomposition = ch.decomposition();
				output += decomposition.isEmpty() ? ch : decomposition.at(0);
				break;
			}
		}
	}

#ifndef QTOPIA
	output.squeeze();
#endif

	DPRINTF("removeDiacritics input:  '%s'", (const char*)input.toUtf8());
	DPRINTF("removeDiacritics output: '%s'", (const char*)output.toUtf8());
	return output;
}

// Returns `input` without punctuation: characters of the first set are
// replaced by a blank, characters of the second set are deleted, and the
// result is passed through simplified().
QString MediaNGramIndexManager::removePunctuations(const QString &input)
{
	// Characters that are turned into a blank.
	const QRegExp blankedCharacters("[+&/\\\\=\\-\\_]");
	// Characters that are removed without replacement.
	const QRegExp droppedCharacters("[,.:!?#%´`'\"\\^°(){}\\[\\]<>*]");

	QString cleaned = input;
	cleaned.replace(blankedCharacters, " ");
	cleaned.replace(droppedCharacters, "");
	cleaned = cleaned.simplified();

	DPRINTF("removePunctuations input:  '%s'", (const char*)input.toUtf8());
	DPRINTF("removePunctuations output: '%s'", (const char*)cleaned.toUtf8());
	return cleaned;
}

// Splits `input` at blanks after passing it through simplified(); empty
// parts are not included in the result.
QStringList MediaNGramIndexManager::tokenizeString(const QString &input)
{
	const QString normalized = input.simplified();

#ifdef QT4
	return normalized.split(" ", QString::SkipEmptyParts);
#else
	return QStringList::split(" ", normalized, false);
#endif
}

// Moves the longest token to the front of `tokenList` by swapping it with the
// first entry; all other entries keep their position. With several tokens of
// maximal length the first one wins.
//
// Rationale (simple but effective): the first token ends up in the most
// nested select, and that is the only place where SQLite provides the most
// efficient indices, so it should reduce the matches as much as possible.
void MediaNGramIndexManager::reorderTokensByLength(QStringList &tokenList)
{
	const int count = tokenList.count();
	int longest = 0;

	// Entry 0 is the initial candidate, so the scan can start at 1.
	for (int candidate = 1; candidate < count; ++candidate)
	{
		if (tokenList[candidate].length() > tokenList[longest].length())
			longest = candidate;
	}

	if (longest != 0)
		qSwap(tokenList[0], tokenList[longest]);
}

void MediaNGramIndexManager::createNGrams(const QString &token, QStringList &outputList, int gramSize, bool padLeft, bool padRight)
{
	QString input = token;

	if (padLeft)
		input = input.rightJustify(input.length() + gramSize - 1, '%');

	if (padRight)
		input = input.leftJustify(input.length() + gramSize - 1, '%');

	gramSize = qMin(gramSize, input.length());

	DPRINTF("input: %s", (const char *)input.toUtf8());

	for (int i = 0; i < input.length() - gramSize + 1; ++i)
		outputList.append(input.mid(i, gramSize));
}

void MediaNGramIndexManager::tokenizeGrammifyAndInsert(const QString &input, const unsigned long mediaID, const int gramSize)
{
	QStringList tokenList = tokenizeString(removePunctuations(input));
	QStringList gramList;

    for (QStringList::Iterator it = tokenList.begin(); it != tokenList.end(); ++it)
    {
		if ((*it).length() < 2)
			continue;

    	sqlite3_bind_null(stmtInsertTokenIndex_, 1);
		sqlite3_bind_text(stmtInsertTokenIndex_, 2, (*it).toUtf8(), -1, SQLITE_TRANSIENT);
    	sqlite3_bind_int(stmtInsertTokenIndex_, 3, mediaID);
		sqlite3_step(stmtInsertTokenIndex_);
		unsigned long tokenID = sqlite3_last_insert_rowid(mediaDatabase_->db());
		sqlite3_reset(stmtInsertTokenIndex_);

    	createNGrams(removeDiacritics(*it).lower(), gramList, gramSize, true, false);
		for (QStringList::Iterator it2 = gramList.begin(); it2 != gramList.end(); ++it2)
		{
			sqlite3_bind_text(stmtInsertNGramIndex_, 1, (*it2).toUtf8(), -1, SQLITE_TRANSIENT);
			sqlite3_bind_int(stmtInsertNGramIndex_, 2, tokenID);
			sqlite3_step(stmtInsertNGramIndex_);
			sqlite3_reset(stmtInsertNGramIndex_);
		}

		gramList.clear();
    }
}

// Reports whether both index tables can be queried, i.e. whether a count
// query on ngram_index and token_index executes without error.
bool MediaNGramIndexManager::nGramIndexExists()
{
	const int result = sqlite3_exec(
		mediaDatabase_->db(),
		"SELECT count(*) FROM ngram_index; SELECT count(*) FROM token_index;",
		NULL, NULL, NULL
	);

	return result == SQLITE_OK;
}

void MediaNGramIndexManager::recreateNGramIndex(const int gramSize)
{
	DENTERMETHOD;
	DTIMERINIT(timer);

	mediaDatabase_->beginUpdate();

	sqlite3_exec(
		mediaDatabase_->db(),
		"DROP TABLE IF EXISTS ngram_index;\n"
		"DROP TABLE IF EXISTS token_index;\n",
		NULL, NULL, NULL
	);
	createSchema();

	sqlite3_stmt *vm;
	sqlite3_prepare_v2(mediaDatabase_->db(), "SELECT media_id, title, album, artist FROM media;", -1, &vm, 0);

	unsigned long mediaID;
	QString title;
	QString album;
	QString artist;

	while (sqlite3_step(vm) == SQLITE_ROW)
	{
		mediaID = sqlite3_column_int(vm, 0);
		title = QString::fromUtf8((const char *)sqlite3_column_text(vm, 1));
		album = QString::fromUtf8((const char *)sqlite3_column_text(vm, 2));
		artist = QString::fromUtf8((const char *)sqlite3_column_text(vm, 3));

		tokenizeGrammifyAndInsert(title, mediaID, gramSize);
		tokenizeGrammifyAndInsert(album, mediaID, gramSize);
		tokenizeGrammifyAndInsert(artist, mediaID, gramSize);
	}

	sqlite3_finalize(vm);

	mediaDatabase_->endUpdate();
	mediaDatabase_->ensurePreparedStatementsAreValid();

	DTIMERPRINT(timer, "updateNGramIndex");
	DEXITMETHOD;
}

// Creates the index tables, their indices and the maintenance triggers if
// they do not exist yet.
//
// The triggers remove a media entry's tokens when the entry is deleted,
// inserted or has its title, album or artist updated, and remove a token's
// n-grams when the token is deleted.
void MediaNGramIndexManager::createSchema()
{
	static const char * const tablesAndIndicesSql =
		"CREATE TABLE IF NOT EXISTS ngram_index (\n"
		"  ngram TEXT,\n"
		"  token_id INTEGER\n"
		");\n"

		"CREATE TABLE IF NOT EXISTS token_index (\n"
		"  token_id INTEGER PRIMARY KEY,\n"
		"  token TEXT,\n"
		"  media_id INTEGER\n"
		");\n"

		"CREATE INDEX IF NOT EXISTS idx_ngram_index_ngram ON ngram_index (ngram);\n"
		"CREATE INDEX IF NOT EXISTS idx_ngram_index_token_id ON ngram_index (token_id);\n"
		"CREATE INDEX IF NOT EXISTS idx_token_index_media_id ON token_index (media_id);\n";

	static const char * const triggersSql =
		"CREATE TRIGGER IF NOT EXISTS trg_delete_media_token_index AFTER DELETE ON media\n"
		"BEGIN\n"
		"	DELETE FROM token_index WHERE media_id = OLD.media_id;\n"
		"END;\n"

		"CREATE TRIGGER IF NOT EXISTS trg_delete_token_ngram_index AFTER DELETE ON token_index\n"
		"BEGIN\n"
		"	DELETE FROM ngram_index WHERE token_id = OLD.token_id;\n"
		"END;\n"

		"CREATE TRIGGER IF NOT EXISTS trg_insert_media_ngram_index AFTER INSERT ON media\n"
		"BEGIN\n"
		"	DELETE FROM token_index WHERE media_id = NEW.media_id;\n"
		"END;\n"

		"CREATE TRIGGER IF NOT EXISTS trg_update_media_ngram_index AFTER UPDATE OF title, album, artist ON media\n"
		"BEGIN\n"
		"	DELETE FROM token_index WHERE media_id = NEW.media_id;\n"
		"END;\n";

	mediaDatabase_->beginUpdate();

	sqlite3_exec(mediaDatabase_->db(), tablesAndIndicesSql, NULL, NULL, NULL);
	sqlite3_exec(mediaDatabase_->db(), triggersSql, NULL, NULL, NULL);

	mediaDatabase_->endUpdate();
	mediaDatabase_->ensurePreparedStatementsAreValid();
}

// Slot connected to the media database's preparingStatements signal:
// (re)creates the two insert statements used to fill the index on `db`.
//
// Statements still held from an earlier preparation are finalized first so
// their handles are not overwritten and leaked. A statement that cannot be
// prepared is left as NULL.
void MediaNGramIndexManager::mediaDatabasePreparingStatements(sqlite3 *db)
{
	DENTERMETHOD;

	if (stmtInsertNGramIndex_)
	{
		sqlite3_finalize(stmtInsertNGramIndex_);
		stmtInsertNGramIndex_ = NULL;
	}

	if (stmtInsertTokenIndex_)
	{
		sqlite3_finalize(stmtInsertTokenIndex_);
		stmtInsertTokenIndex_ = NULL;
	}

	if (sqlite3_prepare_v2(db, "INSERT OR REPLACE INTO ngram_index VALUES(?1, ?2);", -1, &stmtInsertNGramIndex_, 0) != SQLITE_OK)
	{
		DPRINTF("preparing ngram_index insert failed: %s", sqlite3_errmsg(db));
		stmtInsertNGramIndex_ = NULL;
	}

	if (sqlite3_prepare_v2(db, "INSERT OR REPLACE INTO token_index VALUES(?1, ?2, ?3);", -1, &stmtInsertTokenIndex_, 0) != SQLITE_OK)
	{
		DPRINTF("preparing token_index insert failed: %s", sqlite3_errmsg(db));
		stmtInsertTokenIndex_ = NULL;
	}

	DEXITMETHOD;
}

// Slot connected to the media database's finalizingStatements signal:
// finalizes whichever insert statements are currently held and resets them
// to NULL. `db` is not used.
void MediaNGramIndexManager::mediaDatabaseFinalizingStatements(sqlite3 *db)
{
	DENTERMETHOD;

	// Same treatment for both statements, in the order n-gram, token.
	sqlite3_stmt **heldStatements[] = { &stmtInsertNGramIndex_, &stmtInsertTokenIndex_ };
	const int heldCount = sizeof(heldStatements) / sizeof(heldStatements[0]);

	for (int i = 0; i < heldCount; ++i)
	{
		sqlite3_stmt **slot = heldStatements[i];

		if (*slot == NULL)
			continue;

		sqlite3_finalize(*slot);
		*slot = NULL;
	}

	DEXITMETHOD;
}

// Slot connected to the media database's mediaUpdated signal: indexes the
// title, album and artist of `media` (in that order) under its media id,
// using n-grams of cGramSize characters.
void MediaNGramIndexManager::mediaDatabaseMediaUpdated(Media *media)
{
	DENTERMETHOD;

	const unsigned long id = media->mediaID();

	tokenizeGrammifyAndInsert(media->title(), id, cGramSize);
	tokenizeGrammifyAndInsert(media->album(), id, cGramSize);
	tokenizeGrammifyAndInsert(media->artist(), id, cGramSize);

	DEXITMETHOD;
}
