// Logo Search packages:
// Sourcecode: cb2bib version File versions
//
// c2bBibSearcher.cpp

/***************************************************************************
 *   Copyright (C) 2004-2009 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.3.0. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "c2bBibSearcher.h"

#include "c2b.h"
#include "c2bBibParser.h"
#include "c2bSettings.h"
#include "c2bUtils.h"

#include <QApplication>
#include <QDir>


/** \page c2bsearch Search BibTeX files for references

\section descrip Description

    - <b>Search pattern</b> \n Patterns and composite patterns can be either
    \htmlonly
    <a href="http://arxiv.org/abs/0705.0751v1" target="_blank">approximate strings</a>,
    \endhtmlonly
    strings, regular expressions, or wildcard filters. Patterns admit Unicode
    characters. The scope of each pattern can be the reference as a whole or be
    focused on a particular reference field. The fields <tt>year</tt>,
    <tt>file</tt>, and <tt>journal</tt> are treated specifically. The field
    <tt>year</tt> has the qualifiers <tt>Exact</tt>, <tt>Newer</tt>, and
    <tt>Older</tt>. The field <tt>file</tt> can optionally refer to either the
    filename or the contents of such a file. Finally, for <tt>journal</tt>, the
    input pattern is also expanded to the journal full name, if available, and
    both are checked against the actual contents of the <tt>journal</tt> field
    and, if available, its expanded contents. For example, typing 'ijqc'
    retrieves all references with <tt>journal</tt> being 'Int. J. Quantum
    Chem.'. Or, typing 'chemistry' retrieves any of 'J. Math. Chem.', 'J. Phys.
    Chem.', etc. This expansion is not performed when the pattern scope is set
    to <tt>all</tt>.

    - <b>Search scope</b> \n By default, searches are performed on the current
    BibTeX output file. If <b>Scan all BibTeX files</b> is checked the search
    will extend to all BibTeX files, extension .bib, present in the current
    directory. It might be therefore convenient to group all reference files in
    one common directory, or have them linked to that directory. When <b>Scan
    linked documents</b> is checked, and the scope of one or more patterns is
    <tt>all</tt> or <tt>file</tt>, the contents of the file in <tt>file</tt> are
    converted to text and scanned for that given pattern. See the \ref
    c2bconf_utilities section to configure the external document-to-text converter.

    - <b>Search modifier</b> \n
    \htmlonly
    The cb2Bib converts TeX encoded characters to Unicode when parsing the
    references. This permits, for instance, for the pattern 'M&#248;ller' to
    retrieve either 'M&#248;ller' or 'M{\o}ller', without regard to how the
    BibTeX reference is written. By checking <b>Simplify source</b>, the
    reference and the converted PDF files are simplified to plain Ascii. Thus,
    the pattern '\bMoller\b' will hit any of 'M&#248;ller', 'M{\o}ller', or
    'Moller'. Additionally, all non-word characters are removed, preserving
    only the Ascii, word structure of the source. Note that source
    simplification is only performed for the patterns whose scope is
    <tt>all</tt> or <tt>file</tt> contents, and that, so far, the cb2Bib implements
    only a subset of such conversions. Implemented TeX to Unicode conversions
    can be easily checked by entering a reference. The Unicode to Ascii
    letter-only conversion, on the other hand, is the one that the cb2Bib also
    uses to write the reference IDs and, hence, the renaming of dropped files.
    The cb2Bib can also understand minor sub and superscript formatting. For
    instance, the pattern 'H2O' will retrieve 'H<sub>2</sub>O' from a BibTeX
    string 'H$_{2}$O'.
    \endhtmlonly


\section notes Notes

    - The cb2Bib uses an internal cache to speed up the search of linked files.
    By default data is stored as <tt>current_file.bib.c2b</tt>. It might be
    more convenient, however, to set up a temporary directory outside the user
    data backup directories. See <b>Search In Files Cache Directory</b> in \ref
    c2bconf_files. When a linked file is processed for the first time, the
    cb2Bib does several string manipulations, such as removing end of line
    hyphenations. This process is time consuming for very large files.

    - The <b>approximate string</b> search is described in reference
    \htmlonly
    <a href="http://arxiv.org/abs/0705.0751v1" target="_blank">http://arxiv.org/abs/0705.0751v1</a>.
    \endhtmlonly
    It reduces the chance of missing a hit due to transcription and decoding
    errors in the document files. Approximate string is also a form of
    serendipitous information retrieval.


    <p>&nbsp;</p>
*/
/** Constructs the searcher with parent \a parento and stores the parser returned by c2b::bibParser(). */
c2bBibSearcher::c2bBibSearcher(QObject* parento) : QObject(parento)
{
    // Only the parser pointer is set here; counters, flags and result
    // containers are given their defaults in clear().
    bp = c2b::bibParser();
}

/** Destructor. No explicit cleanup is performed here. */
c2bBibSearcher::~c2bBibSearcher()
{}


void c2bBibSearcher::addPattern(bool Not, bool caseSensitive, const QString& patternType, const QString& scope,
                                const QString& yearScope, const QString& pattern)
{
    patternList.append(c2bSearchPattern(Not, caseSensitive, patternType, scope, yearScope, pattern));
    if (!fieldList.contains(scope))
        fieldList.append(scope);
}

void c2bBibSearcher::exec()
{
    if (patternList.count() == 0)
        return;

    qSort(patternList);
    p_include_pdf = p_include_pdf && (fieldList.contains("all") || fieldList.contains("file"));
    if (p_include_pdf && !fieldList.contains("file"))
        fieldList.append("file");
    fieldList.removeAll("all");

    QString AndOr;
    if (p_boolean_and)
        AndOr = ".AND.";
    else
        AndOr = ".OR.";
    for (int i = 0; i < patternList.count(); ++i)
    {
        const c2bSearchPattern& pattern = patternList.at(i);
        logString += tr("% Pattern%1: %2%3\n").arg(i + 1).arg(AndOr).arg(pattern.toString());
    }

    // Search In Files
    QFileInfoList flist;
    if (p_all_bibtex_files)
    {
        QDir bibdir(p_bibtex_dir);
        flist = bibdir.entryInfoList(QStringList() << "*.bib");
    }
    else
        flist.append(QFileInfo(p_bibtex_file));

    for (int i = 0; i < flist.count(); ++i)
    {
        search(flist.at(i).absoluteFilePath());
        if (p_aborted)
        {
            clear();
            errorCounter = -1;
            return;
        }
    }

    // Search Done
    logString += tr("% Scanned References: %1  BibTeX Files: %2  Linked Files: %3\n").
                 arg(referenceCounter).arg(bibtexCounter).arg(pdfCounter);

    if (hitMap.count() == 0)
        return;
    hitString = "\n\n" + QStringList(hitMap.keys()).join("\n\n") + "\n\n";
    logString += tr("% Total Unique Hits: %1\n").arg(hitMap.count());
}

/**
    Requests cancellation of a running search.

    Only raises the p_aborted flag: search() checks it after each scanned
    reference and exec() checks it after each scanned file.
*/
void c2bBibSearcher::abort()
{
    p_aborted = true;
}

/**
    Resets the search definition, the counters and the results.

    p_include_pdf and p_all_bibtex_files are not modified here.
*/
void c2bBibSearcher::clear()
{
    // Search definition
    patternList.clear();
    fieldList.clear();
    p_bibtex_dir = "";
    p_bibtex_file = "";
    p_boolean_and = true;
    p_simplify_source = false;
    p_aborted = false;

    // Counters
    bibtexCounter = 0;
    pdfCounter = 0;
    referenceCounter = 0;
    errorCounter = 0;

    // Results; the log restarts with its header line
    hitMap.clear();
    hitString = "";
    logString = "% cb2Bib " + C2B_VERSION + " / BibTeX Search Log\n";
}

void c2bBibSearcher::search(const QString& bib_file)
{
    bibtexCounter++;
    QString bib_file_contents;
    QFile file(bib_file);
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
    {
        errorCounter++;
        logString += tr("% [cb2bib] Unable to open the file %1 for reading. Error: '%2'.\n").arg(bib_file).arg(file.errorString());
        return;
    }
    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    stream.setAutoDetectUnicode(true);
    bib_file_contents = stream.readAll();
    logString += tr("% Scanning file %1\n").arg(bib_file.trimmed());
    if (p_include_pdf)
        cache.load(bib_file, p_simplify_source);

    int hits = hitMap.count();
    bibReference ref;
    bp->initReferenceParsing(bib_file, fieldList, &ref);
    while (bp->referencesIn(bib_file_contents, &ref))
    {
        referenceCounter++;
        searchBib(bib_file, ref);
        QCoreApplication::processEvents();
        if (p_aborted)
            return;
    }
    logString += tr("% File %1. Hits: %2\n").arg(bib_file.trimmed()).arg(hitMap.count() - hits);
}

void c2bBibSearcher::searchBib(const QString& bib_file, const bibReference& ref)
{
    QString pdf_file_name = ref.value("file");
    QString pdf_file_contents = "unread";

    // Initialize composite search
    bool hit = p_boolean_and;

    // Composite search
    for (int i = 0; i < patternList.count(); ++i)
    {
        const c2bSearchPattern& pattern = patternList.at(i);
        bool ihit = false;

        if (pattern.scope == "year")
        {
            int istr = ref.value(pattern.scope).toInt();
            int jstr = pattern.pattern.toInt();
            if (pattern.yearScope == "=")
                ihit = istr == jstr;
            else if (pattern.yearScope == ">")
                ihit = istr >= jstr;
            else if (pattern.yearScope == "<")
                ihit = istr <= jstr;
        }
        else if (pattern.scope == "all")
        {
            if (p_simplify_source)
                ihit = pattern.matches(c2bUtils::toAscii(ref.unicodeReference, c2bUtils::KeepWords));
            else
                ihit = pattern.matches(ref.unicodeReference);
            if (!ihit)
                if (include_pdf(p_include_pdf, pdf_file_name, &pdf_file_contents))
                    ihit = pattern.matches(pdf_file_contents);
        }
        else if (pattern.scope == "journal")
        {
            QString pattern_full = bp->fullJournal(pattern.pattern);
            QString j_orig = ref.value(pattern.scope);
            QString j_full = bp->fullJournal(j_orig);
            ihit = j_orig.contains(pattern.rx) || j_full.contains(pattern.rx) || j_full == pattern_full;
        }
        else if (pattern.scope == "file" && include_pdf(p_include_pdf, pdf_file_name, &pdf_file_contents))
            ihit = pattern.matches(pdf_file_contents);
        else
            ihit = pattern.matches(ref.value(pattern.scope));

        if (pattern.Not)
            ihit = !ihit;
        if (p_boolean_and)
        {
            hit = hit && ihit;
            if (!hit)
                break;
        }
        else
        {
            hit = hit || ihit;
            if (hit)
                break;
        }
    }
    if (hit)
        if (!hitMap.contains(ref.rawReference))
        {
            if (include_pdf(p_include_pdf, pdf_file_name, &pdf_file_contents))
                hitMap.insert(ref.rawReference, location(bib_file, ref) + excerpts(pdf_file_contents));
            else
                hitMap.insert(ref.rawReference, location(bib_file, ref));
        }
}

/**
    Returns an HTML fragment with excerpts of \a contents around the pattern
    matches, or an empty string when nothing matches.

    Only non-negated patterns whose scope is 'all' or 'file' are considered.
    At most max_unmerged_excerpts matches are collected per pattern.
    Overlapping or touching matches are merged, and at most max_excerpts
    merged matches are rendered, each one inside a span element and
    surrounded by up to context_length characters of context. Matches closer
    than twice the context length are joined into a single excerpt item.
*/
const QString c2bBibSearcher::excerpts(const QString& contents)
{
    const int max_excerpts = 25;
    const int max_unmerged_excerpts = max_excerpts + 10;
    QMap<int, int> exc_endpos; // match start -> match end (exclusive)

    for (int i = 0; i < patternList.count(); ++i)
    {
        const c2bSearchPattern& pattern = patternList.at(i);
        if (pattern.Not)
            continue;
        if (pattern.scope != "all" && pattern.scope != "file")
            continue;
        int n_excerpts = 0;
        int pos = 0;
        while (pos >= 0 && pos <= contents.length())
        {
            pos = pattern.rx.indexIn(contents, pos);
            if (pos < 0)
                break;
            const int matched_length = pattern.rx.matchedLength();
            if (matched_length <= 0)
            {
                // An empty match has nothing to show; step one character
                // forward instead of matching at the same position again
                ++pos;
                continue;
            }
            if (++n_excerpts > max_unmerged_excerpts)
                break;
            exc_endpos.insert(pos, qMax(pos + matched_length, exc_endpos.value(pos)));
            pos += matched_length;
        }
    }
    if (exc_endpos.isEmpty())
        return QString();

    // Merge overlapping or touching matches into the earliest one
    QList<int> i_pos = exc_endpos.keys();
    int pos0 = i_pos.at(0);
    for (int i = 1; i < i_pos.count(); ++i)
    {
        const int posi = i_pos.at(i);
        if (exc_endpos.value(pos0) < posi)
            pos0 = posi;
        else
        {
            const int endposi = exc_endpos.value(posi);
            exc_endpos.remove(posi);
            // qMax: a shorter match nested in the current one must not shrink it
            exc_endpos.insert(pos0, qMax(exc_endpos.value(pos0), endposi));
        }
    }
    i_pos = exc_endpos.keys();

    const int context_length = 40;
    QString exc;
    // Drop the partial word at the beginning of the leading context...
    QRegExp lead_truncated_words("^.*\\s(?=\\w)");
    lead_truncated_words.setMinimal(true);
    // ...and the partial word at the end of the trailing context
    QRegExp tail_truncated_words("\\W+\\w+\\W*$");
    tail_truncated_words.setMinimal(true);

    bool item_begins = true;
    const int items = qMin(i_pos.count(), max_excerpts);

    for (int i = 0; i < items; ++i)
    {
        const int pos = i_pos.at(i);
        const int length = exc_endpos.value(pos) - pos;
        const QString match = "<span>" + c2bUtils::toHtmlString(contents.mid(pos, length)) + "</span>";

        if (item_begins)
        {
            QString pre_match = contents.mid(pos - context_length, context_length);
            pre_match.remove(lead_truncated_words);
            pre_match = c2bUtils::toHtmlString(pre_match);
            exc += "&#8226; ..." + pre_match + match;
        }
        else
            exc += match;

        // The item is closed after the last match, or when the next match is
        // too far away for both contexts to be joined
        const bool item_complete = (i + 1 == items) ||
                                   (exc_endpos.value(pos) + (2 * context_length) < i_pos.at(i + 1));
        if (item_complete)
        {
            QString post_match = contents.mid(pos + length, context_length);
            post_match.remove(tail_truncated_words);
            post_match = c2bUtils::toHtmlString(post_match);
            exc += post_match + "... ";
            item_begins = true;
        }
        else
        {
            // Text between two joined matches; escaped like every other fragment
            exc += c2bUtils::toHtmlString(contents.mid(pos + length, i_pos.at(i + 1) - pos - length));
            item_begins = false;
        }
    }

    exc = "</p><p id=\"excerpt\">" + exc;
    // NOTE(review): also reported when exactly max_excerpts excerpts exist - confirm intended
    if (i_pos.count() >= max_excerpts)
        exc += tr("</p><p><b>Found more than %1 occurrences</b>.").arg(max_excerpts);
    return exc;
}

/**
    Returns \a abstract converted to HTML, with the pattern matches wrapped
    in span elements.

    Only non-negated patterns whose scope is 'all' or 'abstract' are
    considered. Overlapping or touching matches are merged into one span.
    An empty \a abstract is returned unchanged; when nothing matches, the
    whole text is returned through c2bUtils::toHtmlString().
*/
const QString c2bBibSearcher::highlight(const QString& abstract)
{
    if (abstract.isEmpty())
        return abstract;
    QMap<int, int> endpos; // match start -> match end (exclusive)
    for (int i = 0; i < patternList.count(); ++i)
    {
        const c2bSearchPattern& pattern = patternList.at(i);
        if (pattern.Not)
            continue;
        if (pattern.scope != "all" && pattern.scope != "abstract")
            continue;
        int pos = 0;
        while (pos >= 0 && pos <= abstract.length())
        {
            pos = pattern.rx.indexIn(abstract, pos);
            if (pos < 0)
                break;
            const int matched_length = pattern.rx.matchedLength();
            if (matched_length <= 0)
            {
                // An empty match has nothing to highlight; without this step
                // the scan would stay at the same position forever
                ++pos;
                continue;
            }
            endpos.insert(pos, qMax(pos + matched_length, endpos.value(pos)));
            pos += matched_length;
        }
    }
    if (endpos.isEmpty())
        return c2bUtils::toHtmlString(abstract);

    // Merge overlapping or touching matches into the earliest one
    QList<int> i_pos = endpos.keys();
    int pos0 = i_pos.at(0);
    for (int i = 1; i < i_pos.count(); ++i)
    {
        const int posi = i_pos.at(i);
        if (endpos.value(pos0) < posi)
            pos0 = posi;
        else
        {
            const int endposi = endpos.value(posi);
            endpos.remove(posi);
            // qMax: a shorter match nested in the current one must not shrink it
            endpos.insert(pos0, qMax(endpos.value(pos0), endposi));
        }
    }
    i_pos = endpos.keys();

    // Emit plain text and highlighted spans alternately
    QString hla;
    int npos = 0; // end of the text already emitted
    for (int i = 0; i < i_pos.count(); ++i)
    {
        const int pos = i_pos.at(i);
        const int length = endpos.value(pos) - pos;
        hla +=  c2bUtils::toHtmlString(abstract.mid(npos, pos - npos)) + "<span>" +
                c2bUtils::toHtmlString(abstract.mid(pos, length)) + "</span>";
        npos = endpos.value(pos);
    }
    hla += c2bUtils::toHtmlString(abstract.mid(npos, abstract.length() - npos));
    return hla;
}

/**
    Returns the HTML anchor used to edit the reference \a ref of file \a fn.

    The anchor target has the form "Edit reference 'citeid' from file
    [fn:position]" and is shown as the edit16.png icon.
*/
const QString c2bBibSearcher::location(const QString& fn, const bibReference& ref)
{
    QString at("<a href=\"Edit reference '%1' from file [%2:%3]\" class=\"anchor\">"
               "<img src=\":/icons/icons/edit16.png\" alt=\"action\" width=\"16\" height=\"16\" /></a>");
    // All markers are replaced in one pass: with chained arg() calls, a '%3'
    // inside the file name (e.g. a URL-encoded ':' as '%3A') would be taken
    // as the position marker and corrupt the anchor.
    return at.arg(ref.citeidName, fn, QString("%1").arg(ref.positionValue));
}

// Generated by  Doxygen 1.6.0   Back to index