// Logo Search packages:
// Sourcecode: cb2bib version File versions
//
// approximatePattern.cpp

/***************************************************************************
 *   Copyright (C) 2004-2009 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.3.0. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 *
 *   Class implementation of the approximate search algorithm
 *   P. Constans. Approximate textual retrieval. arXiv:0705.0751v1, 2007.
 ***************************************************************************/
#include "approximatePattern.h"
#include "cb2bib_utilities.h"


// Constructs the approximate pattern: forwards the pattern and case
// sensitivity to compositePattern, loads the word prefix and suffix lists
// (file contents split at c2bUtils::nonLetter, empty entries dropped), and
// finally builds the regular expressions through setPattern().
approximatePattern::approximatePattern(const QString newPattern, Qt::CaseSensitivity newCs)
        : compositePattern(newPattern, newCs)
{
    const QString prefixText = c2bUtils::fileToString(":/txt/txt/prefixes.txt");
    pre = prefixText.split(c2bUtils::nonLetter, QString::SkipEmptyParts);

    const QString suffixText = c2bUtils::fileToString(":/txt/txt/suffixes.txt");
    suf = suffixText.split(c2bUtils::nonLetter, QString::SkipEmptyParts);

    setPattern();
}

// Destructor. The body is intentionally empty: this class performs no
// explicit cleanup here.
approximatePattern::~approximatePattern()
{}


void approximatePattern::setPattern()
{
    // Exact match
    if (pattern.length() < 5)
    {
        arx.setPattern(escape(pattern, caseSensitivity));
        return;
    }

    // Single word: Allow 1 error (1 missing / 3 excess), anywhere
    if (wordCount(pattern) == 1)
    {
        arx.setPattern(wordPattern(pattern, caseSensitivity));
        return;
    }

    // Multiple words
    splitPattern();
    if (patternPieces.count() < 3) // Cases: "qq pp", "qqq pp", etc
    {
        arx.setPattern(wordPattern(pattern, caseSensitivity));
        return;
    }

    // Set parameters
    const double percentScan = 50.;
    double scanFactor = 100. / percentScan;
    int maxBlocks = patternPieces.count() / 2;
    int requestedBlocks = c2bUtils::nearInteger(scanFactor);
    int nBlocks = qMin(maxBlocks, 1 + requestedBlocks);
    int nPiecesPerBlock = patternPieces.count() / nBlocks;  // Ceiling
    if ((patternPieces.count() % nBlocks) > 0)
        nPiecesPerBlock++;

//    qDebug() << patternPieces << maxBlocks << scanFactor << requestedBlocks << nPiecesPerBlock << patternPieces.count() % nBlocks;
//    for (int b = 0; b < nBlocks; ++b)
//        for (int i = 0; i < nPiecesPerBlock; ++i)
//            if (b + (i*nBlocks) < patternPieces.count())
//                qDebug() << "block#   " << b << "pick item " <<  b + (i*nBlocks) << patternPieces.at(b + (i*nBlocks));

    QStringList subPatterns;
    QString _stretch = ".{0,%1}%2";
    QRegExp _asrx;
    _asrx.setPatternSyntax(QRegExp::RegExp2);
    _asrx.setMinimal(false);
    _asrx.setCaseSensitivity(Qt::CaseSensitive);
    QMap<double, QRegExp> sort_asrx;

    for (int b = 0; b < nBlocks; ++b)
    {
        int ii = b;
        QString sp = escape(patternPieces.at(ii), caseSensitivity);
        int stretch_product = 1;
        for (int j = 1; j < nPiecesPerBlock; ++j)
        {
            int jj = b + (j * nBlocks);
            if (jj < patternPieces.count())
            {
                int st = stretch(ii, jj);
                sp += _stretch.arg(st).arg(escape(patternPieces.at(jj), caseSensitivity));
                stretch_product *= st;
                ii = jj;
            }
        }
        subPatterns.append(sp);
        _asrx.setPattern(sp);
        // Sort according expectation (arXiv:0705.0751v1, Eq. 6)
        sort_asrx.insert(-stretch_product * probability(sp), _asrx);
    }
    asrx = sort_asrx.values();
    arx.setPattern(QString("(?:%1)").arg(subPatterns.join("|")));
}

void approximatePattern::splitPattern()
{
    if (pattern.isEmpty())
        return;

    QStringList wordList;
    QList<int> wordStarts;
    QList<int> wordEnds;
    splitPattern(pattern, &wordList, &wordStarts, &wordEnds);

    for (int i = 0; i < wordList.count(); ++i)
    {
        QStringList pieces = splitWord(wordList.at(i));
        QString prefix = pieces.at(0);
        if (prefix.length() > 2)
        {
            patternPieces.append(prefix);
            ppStarts.append(wordStarts.at(i));
            ppEnds.append(wordStarts.at(i) + prefix.length());
        }
        QString suffix = pieces.at(1);
        if (suffix.length() > 2)
        {
            patternPieces.append(suffix);
            ppStarts.append(wordEnds.at(i) - suffix.length());
            ppEnds.append(wordEnds.at(i));
        }
    }
}

void approximatePattern::splitPattern(const QString& p, QStringList* w, QList<int>* ws, QList<int>* we) const
{
    w->clear();
    ws->clear();
    we->clear();
    QString str = p;
    str.replace(c2bUtils::nonLetter, " ");
    str.append(' ');
    int w_starts = 0;
    int w_ends = 0;
    bool in_word = false;
    for (int i = 0; i < str.length(); ++i)
        if (str.at(i) == ' ')
        {
            if (in_word)
            {
                // Word actually ends at i - 1. However, this convention simplifies things a little bit.
                w_ends = i;
                w->append(str.mid(w_starts, w_ends - w_starts));
                ws->append(w_starts);
                we->append(w_ends);
            }
            in_word = false;
        }
        else
        {
            if (!in_word)
                w_starts = i;
            in_word = true;
        }
}

// Returns the pair "prefix+root root+suffix" for word.
//
// The lower-cased word is checked against the known prefixes (pre) and
// suffixes (suf), recording the shortest and the longest match of each kind.
// The first item is the leading part of the word and the second item the
// trailing part. The split derived from the longest matches is preferred,
// then the one derived from the shortest matches; a split is accepted only
// if both parts have at least 5 characters. Otherwise, and for words shorter
// than 5 characters, the pair "word word" is returned.
const QStringList approximatePattern::splitWord(const QString& word)
{
    const int minimum_length = 5;
    const int wlen = word.length();
    const QStringList unsplit = QStringList() << word << word;
    if (wlen < minimum_length)
        return unsplit;

    const QString lowered = word.toLower();

    // Shortest / longest matching prefix
    int shortest_prefix = wlen;
    int longest_prefix = 0;
    for (QStringList::const_iterator it = pre.constBegin(); it != pre.constEnd(); ++it)
    {
        if (!lowered.startsWith(*it))
            continue;
        const int matched = it->length();
        shortest_prefix = qMin(shortest_prefix, matched);
        longest_prefix = qMax(longest_prefix, matched);
    }

    // Shortest / longest matching suffix
    int shortest_suffix = wlen;
    int longest_suffix = 0;
    for (QStringList::const_iterator it = suf.constBegin(); it != suf.constEnd(); ++it)
    {
        if (!lowered.endsWith(*it))
            continue;
        const int matched = it->length();
        shortest_suffix = qMin(shortest_suffix, matched);
        longest_suffix = qMax(longest_suffix, matched);
    }

    // A minimum still equal to the word length means "nothing shorter found"
    if (shortest_prefix == wlen)
        shortest_prefix = 0;
    if (shortest_suffix == wlen)
        shortest_suffix = 0;

    const int head_max = qMax(longest_prefix, wlen - longest_suffix);
    const int tail_max = qMax(longest_suffix, wlen - longest_prefix);
    if (head_max >= minimum_length && tail_max >= minimum_length)
        return QStringList() << word.left(head_max) << word.right(tail_max);

    const int head_min = qMax(shortest_prefix, wlen - shortest_suffix);
    const int tail_min = qMax(shortest_suffix, wlen - shortest_prefix);
    if (head_min >= minimum_length && tail_min >= minimum_length)
        return QStringList() << word.left(head_min) << word.right(tail_min);

    return unsplit;
}

// Returns an error-tolerant regular expression for word.
//
// With c the first character of word and R the remaining characters, the
// result is  (?:c(?:A0|A1|...)|R)  where
//   A0 is R without its last character, and
//   Ai (i >= 1) is R with the two characters preceding its last i characters
//      replaced by '.{0,2}'.
// Every literal part goes through escape() with case sensitivity cs.
// An empty word yields an empty expression.
QString approximatePattern::wordPattern(const QString& word, Qt::CaseSensitivity cs) const
{
    if (word.isEmpty()) // word.at(0) below requires at least one character
        return QString();

    const int len = word.length();
    const QString _ord = word.right(len - 1);
    QStringList possible;
    possible.append(escape(_ord.left(len - 2), cs));
    // The escaped parts come from the user's pattern and may contain '%1',
    // '%2', ... Chained .arg().arg() calls would take those for place
    // markers; the multi-argument arg() substitutes all markers in one pass.
    for (int i = 1; i < len - 2; ++i)
        possible.append(QString("%1.{0,2}%2").arg(escape(_ord.left(len - i - 3), cs), escape(_ord.right(i), cs)));
//qDebug() << "WordPattern: " << QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(_ord, cs));
    return QString("(?:%1(?:%2)|%3)").arg(escape(word.at(0), cs), possible.join("|"), escape(_ord, cs));
}

int approximatePattern::wordCount(const QString& str) const
{
    const QString _str = QString(str).replace(c2bUtils::nonLetter, " ").simplified();
    return 1 + _str.count(' ');
}

// Returns the gap width between pieces piece_i and piece_j, which
// setPattern() uses as the upper bound N of the '.{0,N}' separator.
// The base value is 3 times the distance between the end of piece_i and the
// start of piece_j in the pattern. When piece_j is longer than 4 characters,
// the result is at least 20 times the index distance piece_j - piece_i.
int approximatePattern::stretch(int piece_i, int piece_j) const
{
    const int minStretch = 3;
    const int maxStretch = 20;

    // Estimated error ratio only
    const int gap_stretch = minStretch * (ppStarts.at(piece_j) - ppEnds.at(piece_i));
    if (patternPieces.at(piece_j).length() <= 4)
        return gap_stretch;

    const int index_stretch = maxStretch * (piece_j - piece_i);
    return qMax(index_stretch, gap_stretch);
}

// Generated by Doxygen 1.6.0