// Source listing of bibParser.cpp (cb2Bib), taken from a Doxygen-generated page.

/***************************************************************************
 *   Copyright (C) 2004-2009 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.3.0. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "bibParser.h"

#include "authorString.h"
#include "bibPreparser.h"
#include "citeIDMaker.h"
#include "heuristicBibParser.h"
#include "preprocess.h"
#include "pubmedXml.h"
#include "settings.h"

#include <QCoreApplication>
#include <QUrl>


bibParser::bibParser(QObject* parento) : coreBibParser(parento)
{
    // Journal name database, read from the journal file named in the settings
    QString journal_file = _settingsP->fileName("cb2Bib/JournalFile");
    _journal_dbP = new journalDB(journal_file);

    // Month name database
    _month_dbP = new monthDB();

    // Cite ID maker
    _cite_idmP = new citeIDMaker(this);

    // External reference preparser; its status messages are re-emitted as this object's own
    _preparserP = new bibPreparser(this);
    connect(_preparserP, SIGNAL(statusMessage(const QString&)), this, SIGNAL(statusMessage(const QString&)));

    // Preprocess object applied to the input text stream
    _preprocessP = new preprocess(this);

    // Heuristic bibliographic parser, used as fallback when no format is recognized
    _heuristic_parserP = new heuristicBibParser(this);
}

bibParser::~bibParser()
{
    // Release the objects that the constructor created with new and that are deleted by hand.
    // NOTE(review): the cite ID maker, preparser and preprocess objects are not deleted here;
    // they are constructed with 'this' as argument, presumably as Qt child objects -- confirm.
    delete _journal_dbP;
    delete _month_dbP;
    delete _heuristic_parserP;
}

/**
    Process each field and set its final format
*/
00054 QString bibParser::parse(const QString& field, const QString& value, const QString& init_value)
{
    QString v = value;
    if (field == "file")
        return v.trimmed();
    v = removeTags(v);
    if (v.isEmpty())
        return (v);
    c2bUtils::fullBibToC2b(v);
    if (field == "author")
    {
        c2bUtils::debug(v);
        v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
    }
    else if (field == "addauthors")
    {
        c2bUtils::debug(v);
        if (init_value.isEmpty())
            v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
        else
            v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
    }
    else if (field == "editor")
    {
        c2bUtils::debug(v);
        v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
        v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
    }
    else if (field == "addeditors")
    {
        c2bUtils::debug(v);
        v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
        if (init_value.isEmpty())
            v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
        else
            v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
    }
    else if (field == "journal")
    {
        if (_settingsP->value("cb2Bib/SetJournalsToFullname").toBool())
            v = _journal_dbP->retrieveFull(v);
        else
            v = _journal_dbP->retrieve(v);
    }
    else if (field == "keywords")
    {
        v = v.split(QRegExp("[^\\w\\s-]"), QString::SkipEmptyParts).join(", ");
        v.replace(QRegExp("\\s+,"), ",");
    }
    else if (field == "month")
        v = _month_dbP->retrieve(v);
    // Fields edition and note require first letter capitalization
    else if (field == "edition")
        v = c2bUtils::setCapitalization(v);
    else if (field == "note")
        v = c2bUtils::setCapitalization(v);
    // Process pages, volume, number, and year to set hyphenation
    else if (field == "pages")
        v = adjacentNumbers(v);
    else if (field == "volume")
        v = adjacentNumbers(v);
    else if (field == "number")
        v = adjacentNumbers(v);
    else if (field == "year")
        v = adjacentNumbers(v);
    else if (field == "title" || field == "booktitle")
        v = c2bUtils::setCapitalization(v);
    else if (field == "addtitle")
    {
        v = c2bUtils::setCapitalization(v);
        if (!init_value.isEmpty())
            v = init_value + ": " + v;
    }
    return v.simplified();
}

/**
    Process each field and set its final format for a complete reference
*/
00133 bibReference& bibParser::parse(bibReference& reference)
{
    QMutableHashIterator<QString, QString> i(reference);
    while (i.hasNext())
    {
        i.next();
        const QString v = parse(i.key(), i.value());
        i.setValue(v);
    }
    return reference;
}

/**
    Replace the journal names found in text by their full names.

    Every 'journal = {...}' or 'journal = "..."' assignment is looked up with
    fullJournal(). Progress is reported through statusMessage() and pending
    events are processed after each step.

    \param text Text containing the journal assignments.
    \return A copy of text with the journal names substituted.
*/
QString bibParser::setJournalsToFull(const QString& text)
{
    QString substituted_text = text;
    QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive);
    jnre.setMinimal(true);
    int pos = 0;
    uint nj = 0;
    while (pos >= 0)
    {
        pos = jnre.indexIn(substituted_text, pos);
        if (pos > -1)
        {
            // Read all match data before substituted_text is modified
            int matched_length = jnre.matchedLength();
            const QString jn = jnre.cap(1);
            const int jn_pos = jnre.pos(1);
            // Replace the captured name only: a search-and-replace over the whole match would also
            // hit the 'journal' keyword, and would corrupt the entry when the name is empty.
            // pos(1) is -1 for an empty capture, in which case the match is left untouched.
            if (!jn.isEmpty() && jn_pos > -1)
            {
                const QString full_jn = fullJournal(jn);
                substituted_text.replace(jn_pos, jn.length(), full_jn);
                matched_length += full_jn.length() - jn.length();
            }
            // Resume searching right after the (possibly resized) match
            pos += matched_length;
            nj++;
        }
        emit statusMessage(tr("Processed %1 journal names...").arg(nj));
        QCoreApplication::processEvents();
    }
    emit statusMessage(tr("Processed %1 journal names.").arg(nj));
    return (substituted_text);
}

/**
    Replace the journal names found in text by their abbreviated names.

    Every 'journal = {...}' or 'journal = "..."' assignment is looked up with
    abbreviatedJournal(). Progress is reported through statusMessage() and
    pending events are processed after each step.

    \param text Text containing the journal assignments.
    \return A copy of text with the journal names substituted.
*/
QString bibParser::setJournalsToAbbreviated(const QString& text)
{
    QString substituted_text = text;
    QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"](.*)[\\}\"]", Qt::CaseInsensitive);
    jnre.setMinimal(true);
    int pos = 0;
    uint nj = 0;
    while (pos >= 0)
    {
        pos = jnre.indexIn(substituted_text, pos);
        if (pos > -1)
        {
            // Read all match data before substituted_text is modified
            int matched_length = jnre.matchedLength();
            const QString jn = jnre.cap(1);
            const int jn_pos = jnre.pos(1);
            // Replace the captured name only: a search-and-replace over the whole match would also
            // hit the 'journal' keyword, and would corrupt the entry when the name is empty.
            // pos(1) is -1 for an empty capture, in which case the match is left untouched.
            if (!jn.isEmpty() && jn_pos > -1)
            {
                const QString abbreviated_jn = abbreviatedJournal(jn);
                substituted_text.replace(jn_pos, jn.length(), abbreviated_jn);
                matched_length += abbreviated_jn.length() - jn.length();
            }
            // Resume searching right after the (possibly resized) match
            pos += matched_length;
            nj++;
        }
        emit statusMessage(tr("Processed %1 journal names...").arg(nj));
        QCoreApplication::processEvents();
    }
    emit statusMessage(tr("Processed %1 journal names.").arg(nj));
    return (substituted_text);
}

/**
    Build a short excerpt of text: the hints followed by the first remaining
    words of text, at most 15 words in total, separated by single spaces.
*/
QString bibParser::excerpt(const QString& text, const QStringList& hints) const
{
    const QString blank(" ");
    QString words = removeTags(text);

    // Blank out, in this order: embedded metadata, URLs, all-uppercase words, digits,
    // non-letters, one- and two-character words, common words, and month names
    words.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]"), blank);
    words.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+"), blank);
    words.replace(QRegExp("\\b[A-Z]+\\b"), blank);
    words.replace(QRegExp("\\d"), blank);
    words.replace(c2bUtils::nonLetter, blank);
    words.replace(QRegExp("\\b\\w{1,2}\\b"), blank);
    words.replace(QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their|this|where|with|www)\\b", Qt::CaseInsensitive), blank);
    words.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|september|october|november|december)\\b",
                          Qt::CaseInsensitive), blank);

    // Hints are not filtered and always come first
    words = hints.join(" ") + " " + words;
    const QStringList all_words = words.simplified().split(" ", QString::SkipEmptyParts);

    QStringList leading_words;
    const int limit = qMin(15, all_words.count());
    for (int w = 0; w < limit; ++w)
        leading_words << all_words.at(w);
    return leading_words.join(" ");
}

void bibParser::setField(const QString& name, const QString& value)
{
    _current_reference[name] = parse(name, value);
}

/**
    Set the type name of the current reference to type. The value is stored
    as given, without any formatting.
*/
void bibParser::setReferenceType(const QString& type)
{
    _current_reference.typeName = type;
}

void bibParser::setCiteID()
{
    _current_reference.citeidName = _cite_idmP->makeCiteID(_current_reference);
}

/**
    Return the value of the 'file' field for the document base_fn.

    If the setting cb2Bib/MovePdf is off, base_fn itself is returned, converted
    to a local file name when it is a file URL. Otherwise the name is built from
    the cite ID of the current reference ("no_cite_id" if it is empty), the
    document extension, and the configured document directory.

    \param base_fn   Name or URL of the source document.
    \param bibtex_fn Name of the BibTeX file; passed on to c2bUtils::documentFilename().
    \return The formatted file name, or base_fn unchanged if it is empty.
*/
QString bibParser::documentFilename(const QString& base_fn, const QString& bibtex_fn)
{
    if (base_fn.isEmpty())
        return base_fn;
    if (!_settingsP->value("cb2Bib/MovePdf").toBool())
    {
        const QUrl u(base_fn);
        if (u.scheme() == "file")
            return parse("file", u.toLocalFile());
        return parse("file", base_fn);
    }
    QString filename = _current_reference.citeidName;
    if (filename.isEmpty())
        filename = "no_cite_id";
    const QFileInfo fi(base_fn);
    QString docExtension = "." + fi.suffix().toLower();
    // Possible document extensions
    if (!docExtension.contains(QRegExp("^\\.\\w{2,4}$")))
        docExtension = ".pdf"; // Default
    if (docExtension == ".gz")
    {
        // completeSuffix() has no leading dot, and it may hold further dotted parts
        // of the file name in front of the composite extension
        const QString complete_suffix = fi.completeSuffix().toLower();
        if (complete_suffix == "ps.gz" || complete_suffix.endsWith(".ps.gz"))
            docExtension = ".ps.gz"; // Composite extension
    }
    filename = c2bUtils::documentFilename(_settingsP->value("cb2Bib/RelativePdfDirectory").toBool(), bibtex_fn,
                                          _settingsP->fileName("cb2Bib/PdfDirectory"), filename + docExtension);
    return parse("file", filename);
}


/****************************************************************************

AUTOMATIC BIB CAPTION

*****************************************************************************/

/**
    Forward text to the preparser object created in the constructor; out_text is
    passed through to it unchanged.
    NOTE(review): presumably out_text receives the preparsed text -- confirm in bibPreparser.
*/
void bibParser::preparse(const QString& text, QString* out_text)
{
    _preparserP->preparse(text, out_text);
}

/** \page bibproc Extracting Data from the Clipboard

    Clipboard contents are processed according to the following rules:

    - Perform external, user-defined preparsing on input stream. See \ref c2bconf_clipboard.

    - Perform user-defined substitutions on input stream. See \ref c2bconf_clipboard.

    - Check if input stream is already a BibTeX entry. If so, process entry.

    - Check if input stream is a PubMed - Medline Journal entry. If so, process entry.

    - Preprocess author names: PI JOAN III -> Pi III, J.
    (care of name prefixes, suffixes, and removal of ambiguities).


    If otherwise,

    - Extract DOI \n (DOI, URL and FILE/PDF extraction is a preprocessing step,
    performed before the automatic recognition takes place.)

    - Extract URL

    - Remove leading and trailing white spaces, TABs and CRs.

    - "\r\n", "\n" and/or "\r" replaced by the line indicator tag "<NewLineN>".

    - Replace "\t" and ten or more consecutive "\s" by the tabular tag "<TabN>".

    - Simplify White Spaces

    - Start the automatic recognition engine.


    If the automatic recognition engine fails, optionally, a heuristic guessing
    will be performed. See also \ref heuristic_guess and \ref metadata.

*/
void bibParser::parse(const QString& text, QString* out_text, QString* out_tagtext)
{
    QString& otext = *out_text;
    otext = text;
    QString& ottext = *out_tagtext;
    ottext.clear();
    _preprocessP->preprocessText(out_text);

    if (hasBibTeX(otext))
    {
        _current_reference = wholeReference(otext);
        parse(_current_reference);
        currentReferenceUpdated();
        _auto_recognized_string = tr("Processed as 'BibTeX'.");
        _auto_recognized = true;
        emit statusMessage(_auto_recognized_string);
        return;
    }

    if (otext.contains("<PubmedArticle>"))
    {
        pubmedXml pxml(otext);
        _current_reference = pxml.reference();
        parse(_current_reference);
        currentReferenceUpdated();
        _auto_recognized_string = tr("Processed as 'PubMed XML'.");
        _auto_recognized = true;
        emit statusMessage(_auto_recognized_string);
        return;
    }

    if (otext.contains(QRegExp("^\\s*PMID\\s*-")))
    {
        _current_reference.typeName = "article";
        otext = ' ' + otext;
        ottext = otext;
        ottext.replace(QRegExp("[\\n\\r]\\s*([A-Z]{1,4}\\s*-)"), "][\\1");
        ottext = ottext.simplified();
        if (!ottext.contains(QRegExp("\\[FAU\\s+-")))
            ottext.replace(QRegExp("\\[(AU\\s*-\\s*[-'\\w]+)"), "[F\\1 ");
        QStringList fList = ottext.split("][");
        QString kw;
        QRegExp fld("^([A-Z]{1,4})\\s{0,1}-\\s*(.+)$");
        fld.setPatternSyntax(QRegExp::RegExp2);
        for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it)
        {
            if (fld.indexIn(*it) == -1)
                continue;
            const QString tag = fld.cap(1);
            QString value = fld.cap(2);
            if (tag == "AB")
                _current_reference["abstract"] = parse("abstract", value);
            else if (tag == "FAU")
                _current_reference["author"] = parse("addauthors", author::fromMedline(value),
                                                     _current_reference.value("author"));
            else if (tag == "TA")
                _current_reference["journal"] = parse("journal", value);
            else if (tag == "IP")
                _current_reference["number"] = parse("number", value);
            else if (tag == "PG")
                _current_reference["pages"] = parse("pages", value);
            else if (tag == "TI")
                _current_reference["title"] = parse("title", value);
            else if (tag == "PMID")
                _current_reference["url"] = parse("url", c2bUtils::pubmedUrl.arg(value));
            else if (tag == "VI")
                _current_reference["volume"] = parse("volume", value);
            else if (tag == "AID")
            {
                if (value.contains("[doi]"))
                    _current_reference["doi"] = parse("doi", value.remove("[doi]"));
            }
            else if (tag == "DP")
                _current_reference["year"] = parse("year", value.replace(QRegExp("^([\\d\\s]+).*$"), "\\1"));
            else if (tag == "MH")
                kw += "; " + value.trimmed();
        }
        if (!kw.isEmpty())
            _current_reference["keywords"] = parse("keywords", kw.remove(0, 2));
        currentReferenceUpdated();
        _auto_recognized_string = tr("Processed as 'PubMed - Medline Journals'.");
        _auto_recognized = true;
        emit statusMessage(_auto_recognized_string);
        return;
    }

    QRegExp rxdoi("(10.[\\d\\.]+/\\S+)");
    int ndoi = rxdoi.indexIn(otext);
    if (ndoi > -1)
    {
        QString cdoi = rxdoi.cap(1);
        // This happens when publishers set doi to title in metadata: <title>doi:10. ... </title>
        if (cdoi.endsWith("</title>"))
            cdoi.chop(8);
        _current_reference["doi"] = parse("doi", cdoi);
    }

    QRegExp rxarxiv("arXiv:([\\w\\./-]+)");
    int narxiv = rxarxiv.indexIn(otext);
    if (narxiv > -1)
    {
        _current_reference["journal"] = parse("journal", rxarxiv.cap(0));
        _current_reference["url"] = parse("url", QString("http://arxiv.org/abs/%1").arg(rxarxiv.cap(1)));
    }

    QRegExp rxhtml("((http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+)");
    int nhtml = rxhtml.indexIn(otext);
    if (nhtml > -1)
        _current_reference["url"] = parse("url", rxhtml.cap(1));

    // Set tags and start regular expression extraction
    ottext = setTags(otext);
    QString regular_expression_f = _settingsP->fileName("cb2Bib/RegularExpressionFile");
    checkRegExpFile(regular_expression_f);
    QFile file(regular_expression_f);
    file.open(QIODevice::ReadOnly | QIODevice::Text);
    QString ItemX;
    QString line;
    QString reftype;
    QString fieldset;
    QTextStream stream(&file);
    stream.setCodec("UTF-8");
    stream.setAutoDetectUnicode(true);
    int nfilters = 0;

    while (!stream.atEnd())
    {
        line = stream.readLine();
        if (!(line.isEmpty() || line.contains(QRegExp("^#"))))
        {
            reftype = stream.readLine();
            fieldset = stream.readLine();
            ItemX = stream.readLine();

            c2bUtils::debug(tr("The RegExp file contains1: |%1|").arg(line));
            c2bUtils::debug(tr("The RegExp file contains2: |%1|").arg(reftype));
            c2bUtils::debug(tr("The RegExp file contains3: |%1|").arg(fieldset));
            c2bUtils::debug(tr("The RegExp file contains4: |%1|").arg(ItemX));

            QRegExp rx(ItemX);
            rx.setMinimal(true);
            if (!rx.isValid())
                qDebug(tr("[cb2Bib] RegExp |%1| is not valid.").arg(ItemX).toLatin1());

            const QStringList list = fieldset.split(' ', QString::SkipEmptyParts);
            const int efields = list.count();
            const int cfields = rx.numCaptures();
            int npos = rx.indexIn(ottext);
            c2bUtils::debug(tr("Expected Fields: |%1|").arg(efields));
            c2bUtils::debug(tr("Captured Fields: |%1|").arg(cfields));
            c2bUtils::debug(tr("Position: |%1|").arg(npos));
            if (efields != cfields)
            {
                qDebug(tr("[cb2Bib] RegExp |%1| is not valid. Mismatch between expected and actual captures.")
                       .arg(ItemX).toLatin1());
                npos = -1;
            }
            nfilters++;

            if (npos > -1)
            {
                for (int i = 0; i < cfields; i++)
                {
                    const QString& listi = list.at(i);
                    int ii = i + 1;
                    c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i).arg(rx.cap(ii)));
                    if (_field_re.indexIn(listi) > -1)
                    {
                        if (listi == "author")
                            // Reminder: "addauthors" requires to init _current_reference["author"]
                            _current_reference[listi] = parse("addauthors", rx.cap(ii), _current_reference.value(listi));
                        else if (listi == "editor")
                            // Reminder: "addeditors" requires to init _current_reference["editor"]
                            _current_reference[listi] = parse("addeditors", rx.cap(ii), _current_reference.value(listi));
                        else if (listi == "title")
                            // Reminder: "addtitle" requires to init _current_reference["title"]
                            _current_reference[listi] = parse("addtitle", rx.cap(ii), _current_reference.value(listi));
                        else
                            _current_reference[listi] = parse(listi, rx.cap(ii));
                    }
                }
                _current_reference.typeName = reftype;
                currentReferenceUpdated();
                _auto_recognized_string = tr("Processed as '%1'.").arg(line);
                _auto_recognized = true;
                emit statusMessage(_auto_recognized_string);
                file.close();
                return;
            }
        }
    }
    file.close();

    // Heuristic Bib Parsing
    if (_settingsP->value("cb2Bib/DoHeuristicGuess").toBool())
    {
        // Sometimes (if user is on tag mode) tag could be on otext. Revert tags here, just in case.
        const QString clean_text = removeTags(ottext);
        _heuristic_parserP->guessFields(clean_text, ottext);
        currentReferenceUpdated();
        _auto_recognized_string = tr("Applied %1 filters: No automatic format detection. %2 fields guessed.")
                                  .arg(nfilters).arg(fieldCount());
    }
    else
        _auto_recognized_string = tr("Applied %1 filters: No automatic format detection.").arg(nfilters);
    emit statusMessage(_auto_recognized_string);
}

void bibParser::checkRegExpFile(const QString& fn)
{
    if (fn.isEmpty())
    {
        qDebug(QObject::tr("[cb2Bib] No regular expression file especified.").toLatin1());
        return;
    }
    QFileInfo fi(fn);
    if (!fi.exists() || !fi.isReadable())
    {
        qDebug(QObject::tr("[cb2Bib] Could not open regular expression file %1 for reading.").arg(fn).toLatin1());
        return;
    }
}

void bibParser::guessFields(const QString& text)
{
    const QString clean_text = text.simplified();
    const QString tagged_text = setTags(text);
    _heuristic_parserP->guessFields(clean_text, tagged_text);
    currentReferenceUpdated();
    _auto_recognized_string = tr("%1 fields guessed.").arg(fieldCount());
    emit statusMessage(_auto_recognized_string);
}

// Concatenates parts, appending to each part but the last the tag obtained from
// tag_template by replacing %1 with the 1-based position of that part.
static QString c2bJoinWithNumberedTags(const QStringList& parts, const QString& tag_template)
{
    QString joined("");
    const int last = parts.count() - 1;
    for (int k = 0; k < last; ++k)
        joined += parts.at(k) + tag_template.arg(k + 1);
    joined += parts.at(last);
    return joined;
}

/**
    Return text with every line break replaced by a numbered <NewLineN> tag,
    every TAB or run of ten or more white spaces replaced by a numbered <TabN>
    tag, and the remaining white space simplified.
*/
QString bibParser::setTags(const QString& text) const
{
    const QString eol_mark("<found_new_line>");
    QString tagged = text.trimmed();
    tagged.replace(QRegExp("\\r\\n"), eol_mark);   // Windows new line
    tagged.replace(QRegExp("\\n"), eol_mark);      // Linux new line, LF
    tagged.replace(QRegExp("\\r"), eol_mark);      // OSX new line, CR
    tagged = c2bJoinWithNumberedTags(tagged.split(eol_mark), QString("<NewLine%1>"));
    tagged = c2bJoinWithNumberedTags(tagged.split(QRegExp("(\\s{10,}|\\t)")), QString("<Tab%1>"));
    return tagged.simplified();
}

/**
    Return text without the "[[" and "]]" marks, with the <NewLineN> and <TabN>
    tags replaced by a space, and with white space simplified.
*/
QString bibParser::removeTags(const QString& text) const
{
    QString untagged(text);
    untagged.remove("[[").remove("]]");
    untagged.replace(QRegExp("<NewLine\\d+>"), " ").replace(QRegExp("<Tab\\d+>"), " ");
    return untagged.simplified();
}

// Listing generated by Doxygen 1.6.0.