Logo Search packages:      
Sourcecode: cb2bib version File versions

metadataParser.cpp

/***************************************************************************
 *   Copyright (C) 2004-2009 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.3.0. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#include "metadataParser.h"

#include "coreBibParser.h"
#include "settings.h"

#include <QProcess>
#include <QXmlStreamReader>


/** \page metadata Reading and writing bibliographic metadata


    \section metadata_read Reading metadata

    Metadata in scientific documents is, unfortunately, rarely appreciated and
    not widely used. When it comes to bibliographic metadata, the situation is
    even more disappointing: there is no accepted format specification, and the
    reliability of publishers' metadata, if any at all, is questionable in many
    cases.

    The cb2Bib reads all XMP (a specific XML standard devised for metadata
    storage) packets found in the document. It then parses the XML strings
    looking for nodes and attributes with key names meaningful to bibliographic
    references. If a given bibliographic field is found in multiple packets,
    the cb2Bib will take the last one, which most often, and according to the
    PDF specs, is the most updated one. The fields <tt>file</tt>, which would
    be the document itself, and <tt>pages</tt>, which is usually the actual
    number of pages, are skipped.

    The metadata is then summarized in the cb2Bib clipboard panel as, for
    instance

\code
[Bibliographic Metadata
<title>arXiv:0705.0751v1  [cs.IR]  5 May 2007</title>
/Bibliographic Metadata]
\endcode

    This data, whenever the user considers it to be correct, can be easily
    imported by the built-in 'Heuristic Guess' capability. On the other hand,
    if keys are found with the prefix <tt>bibtex</tt>, the cb2Bib will assume
    the document does contain bibliographic metadata, and it will only consider
    the keys having this prefix. Assuming therefore that metadata is
    bibliographic, the cb2Bib will automatically import the reference. This
    way, if using PDFImport, BibTeX-aware documents will be processed as
    successfully recognized, without requiring any user supplied regular
    expression.

    See also \ref relnotes100, \ref c2bconf_clipboard, and \ref c2bpdfimport.
    <p>&nbsp;</p>


    \section metadata_write Writing metadata

    Once an extracted reference is saved and there is a document attached to
    it, the cb2Bib will optionally insert the bibliographic metadata into the
    document itself. The cb2Bib writes an XMP packet as, for instance,

\code
<bibtex:author>P. Constans</bibtex:author>
<bibtex:journal>arXiv 0705.0751</bibtex:journal>
<bibtex:title>Approximate textual retrieval</bibtex:title>
<bibtex:type>article</bibtex:type>
<bibtex:year>2007</bibtex:year>
\endcode

    which is similar to
    \htmlonly
    <a href="http://jabref.sourceforge.net/help/XMPHelp.php" target="_blank">JabRef</a>,
    \endhtmlonly
    but differs in that the cb2Bib strictly sticks to BibTeX and avoids
    (perhaps unnecessary) syntax specialization in author strings.

    The BibTeX fields <tt>file</tt> and <tt>id</tt> are skipped when writing. The
    former for the reason mentioned above, and the latter because it is easily
    generated by specialized BibTeX software according to each user's preferences.
    LaTeX escaped characters for non-ASCII letters are converted to Unicode, as
    XMP already specifies this codec.

    The actual writing of the packet into the document is performed by
    ExifTool, an excellent Perl program written by Phil Harvey. See
    \htmlonly
    <a href="http://www.sno.phy.queensu.ca/~phil/exiftool/" target="_blank">http://www.sno.phy.queensu.ca/~phil/exiftool/</a>.
    \endhtmlonly
    ExifTool supports several document formats for writing. The most relevant
    here are Postscript and PDF. For PDF documents, metadata is written as an
    incremental update of the document. This exactly preserves the binary
    structure of the document, and changes can be easily reversed or modified
    if so desired. Whenever ExifTool is unable to insert metadata, e.g.,
    because the document format is not supported or it has structural errors,
    the cb2Bib will issue an information message, and the document will remain
    untouched.


    See also \ref c2bconf_documents and \ref update_metadata.

*/
/**
    Constructs a metadata parser that creates its own coreBibParser, passing
    this object as the constructor argument (presumably the QObject parent),
    and then sets up the field recognition patterns through init().
*/
metadataParser::metadataParser(QObject* parento)
        : QObject(parento), _cbpP(new coreBibParser(this))
{
    init();
}

/**
    Constructs a metadata parser that uses an already existing coreBibParser.
    The given pointer must not be null; this is asserted in debug builds.
*/
metadataParser::metadataParser(coreBibParser* cbp, QObject* parento)
        : QObject(parento)
{
    _cbpP = cbp;
    Q_ASSERT_X(_cbpP, "metadataParser", "coreBibParser was not instantiated");
    init();
}

/** Destructor. Performs no explicit cleanup. */
metadataParser::~metadataParser()
{
}


/**
    Fetches the settings instance and builds the two case insensitive regular
    expressions used to recognize bibliographic key names in XMP packets:
    _fields for generic metadata, and _bibtex_fields for keys carrying the
    'bibtex:' prefix.
*/
void metadataParser::init()
{
    _settingsP = settings::instance();

    // Generic bibliographic fields.
    // Fields file (it is the document itself) and pages (usually the number of
    // pages of the document, not the reference page range) are deliberately
    // left out of this list.
    _fields = QRegExp("\\b(?:abstract|address|annote|author|booktitle|chapter|"
                      "doi|edition|editor|eprint|institution|isbn|issn|journal|"
                      "keyword|keywords|key words|month|note|number|organization|"
                      "publisher|school|series|title|url|volume|year)\\b");
    _fields.setCaseSensitivity(Qt::CaseInsensitive);

    // Recognition from BibTeX entries. Keys with the 'bibtex:' prefix are
    // explicit bibliographic data, therefore pages is kept here.
    _bibtex_fields = QRegExp("\\bbibtex:(?:abstract|address|annote|author|booktitle|chapter|"
                             "doi|edition|editor|eprint|institution|isbn|issn|journal|"
                             "keywords|month|note|number|organization|pages|publisher|"
                             "school|series|title|url|volume|year)\\b");
    _bibtex_fields.setCaseSensitivity(Qt::CaseInsensitive);
}

/**
    Reads the metadata of document fn and returns it as a text block delimited
    by '[Bibliographic Metadata' and '/Bibliographic Metadata]'. If 'bibtex:'
    keys were found the block holds the reference as BibTeX; otherwise it holds
    one <field>value</field> line per field found. Returns an empty string if
    no metadata could be read.
*/
const QString metadataParser::metadata(const QString& fn)
{
    if (!_metadata(fn))
        return QString();

    QString body;
    if (!_has_bibtex)
    {
        const QString tagged_line("<%1>%2</%1>\n");
        const QStringList& names = _cbpP->bibliographicFields();
        // Type goes first, followed by the fields in their bibliographic order
        if (_ref.contains("type"))
            body += tagged_line.arg("type").arg(_ref.value("type"));
        for (int n = 0; n < names.count(); ++n)
        {
            const QString& name = names.at(n);
            if (!_ref.contains(name))
                continue;
            body += tagged_line.arg(name).arg(_ref.value(name));
        }
    }
    else
        body = _cbpP->referenceToBibTeX(_ref);

    return QString("[Bibliographic Metadata\n%1/Bibliographic Metadata]\n").arg(body);
}

/**
    Reads the metadata of document fn into ref. The reference is always cleared
    first, and it is only filled, and true returned, when the document holds
    'bibtex:' keys written within a packet that mentions the cb2Bib namespace.
*/
bool metadataParser::metadata(const QString& fn, bibReference* ref)
{
    ref->clearReference();
    if (!_metadata(fn))
        return false;
    if (!(_has_bibtex && _has_cb2bib))
        return false;
    *ref = _ref;
    return true;
}

bool metadataParser::_metadata(const QString& fn)
{
    QByteArray raw_contents;
    QFile f(fn);
    if (f.open(QIODevice::ReadOnly))
    {
        raw_contents = f.readAll();
        f.close();
    }
    else
        return false;

    _ref.clearReference();
    _ref.typeName = "article";
    _has_bibtex = false;
    _has_cb2bib = false;

    QStringList xmls;
    _metadataXmp(fn, raw_contents, &xmls);
    // Last in list should be the most updated, parse it last
    for (int i = 0; i < xmls.count(); ++i)
        _fuzzyParser(xmls.at(i));
    QMutableHashIterator<QString, QString> it(_ref);
    while (it.hasNext())
    {
        it.next();
        it.value() = c2bUtils::fromQtXmlString(it.value());
    }
    if (!_has_cb2bib)
        _miscellaneousData(fn, raw_contents);
    if (_ref.count() == 0)
        return false;

    if (_has_bibtex)
        if (_ref.contains("type"))
            _ref.typeName = _ref.value("type");

    return true;
}

void metadataParser::_metadataXmp(const QString& fn, const QByteArray& raw_contents, QStringList* xmls)
{
    xmls->clear();
    int pos = 0;
    while (pos > -1)
    {
        // Scan all packets, and do not trust "=''  " etc, as producers encode differently
        pos = raw_contents.indexOf("<?xpacket begin", pos);
        if (pos > -1)
        {
            int posn = raw_contents.indexOf("<?xpacket end", pos);
            if (posn > pos)
            {
                xmls->append(c2bUtils::toQtXmlString(QString::fromUtf8(raw_contents.mid(pos, posn - pos + 19))));
                _has_bibtex = _has_bibtex || xmls->last().contains("bibtex:");
                _has_cb2bib = _has_cb2bib || xmls->last().contains("http://www.molspaces.com/cb2bib");
                pos = posn;
            }
            else
                pos = -1;
        }
    }
    if (xmls->count() == 0)
        _metadataXmpExifTool(fn, xmls);
}

void metadataParser::_miscellaneousData(const QString& fn, const QByteArray& raw_contents)
{
    // Get title and author from here whenever no cb2Bib BibTeX data is available
    QString data;
    QRegExp pdf_author_rx;
    QRegExp pdf_title_rx;
    QString exiftool_bin = _settingsP->fileName("cb2Bib/ExifToolBin");
    if (QFileInfo(exiftool_bin).exists())
    {
        QProcess exiftool;
        QStringList arglist;
        arglist.append(fn);
        exiftool.start(exiftool_bin, arglist);
        if (!exiftool.waitForFinished(90000))
            exiftool.kill();
        data = QString::fromUtf8(exiftool.readAllStandardOutput());
        pdf_author_rx.setPattern("Author\\s*:\\s+(.*)\\n");
        pdf_title_rx.setPattern("Title\\s*:\\s+(.*)\\n");
    }
    else
    {
        if (!raw_contents.startsWith("%PDF"))
            return;
        data = _pdfDictionary(raw_contents);
        pdf_author_rx.setPattern("\\Author\\s*\\((.*)\\)");
        pdf_title_rx.setPattern("\\Title\\s*\\((.*)\\)");
    }

    pdf_author_rx.setMinimal(true);
    pdf_author_rx.setCaseSensitivity(Qt::CaseSensitive);
    if (pdf_author_rx.indexIn(data) > -1)
        if (!pdf_author_rx.cap(1).isEmpty())
            _ref["author"] = pdf_author_rx.cap(1);
    // Done if BibTeX, otherwise try checking dictionary for title
    if (_has_bibtex)
        return;
    pdf_title_rx.setMinimal(true);
    pdf_title_rx.setCaseSensitivity(Qt::CaseSensitive);
    if (pdf_title_rx.indexIn(data) > -1)
        if (!pdf_title_rx.cap(1).isEmpty())
            _ref["title"] = pdf_title_rx.cap(1);
}

/**
    Heuristic to locate the Pdf dictionary: takes the last occurrence of
    '/Producer' and returns the text from the preceding '<<' up to, and
    including, the following '>>'. Returns an empty string if any of the three
    marks is missing.
*/
const QString metadataParser::_pdfDictionary(const QByteArray& rawpdf)
{
    const int producer = rawpdf.lastIndexOf("/Producer");
    if (producer < 0)
        return QString();

    const int opening = rawpdf.lastIndexOf("<<", producer);
    if (opening < 0)
        return QString();

    const int closing = rawpdf.indexOf(">>", producer);
    if (closing <= opening)
        return QString();

    return QString::fromLatin1(rawpdf.mid(opening, closing - opening + 2));
}

void metadataParser::_metadataXmpExifTool(const QString& fn, QStringList* xmls)
{
    // Not actually needed, called for not directly visible XMP packages
    // It's slower than _metadataXmp() full scan.
    QString exiftool_bin = _settingsP->fileName("cb2Bib/ExifToolBin");
    if (!QFileInfo(exiftool_bin).exists())
        return;
    QProcess exiftool;
    QStringList arglist;
    arglist.append("-xmp");
    arglist.append("-b");
    arglist.append(fn);
    exiftool.start(exiftool_bin, arglist);
    if (!exiftool.waitForFinished(90000))
        exiftool.kill();
    QString xmp = c2bUtils::toQtXmlString(QString::fromUtf8(exiftool.readAllStandardOutput()));
    if (xmp.startsWith("<?xpacket begin"))
    {
        xmls->append(xmp);
        _has_bibtex = _has_bibtex || xmls->last().contains("bibtex:");
        _has_cb2bib = _has_cb2bib || xmls->last().contains("http://www.molspaces.com/cb2bib");
    }
}

/**
    Parses one XMP packet and stores in _ref the values of the attributes and
    elements whose names are meaningful to bibliographic references. Values
    already in _ref are overwritten, so packets parsed later take precedence.

    Only names carrying the 'bibtex:' prefix are recognized as fields when
    _has_bibtex is set. In addition, 'authors' is taken as author, 'summary'
    and 'subject' as abstract, and 'bibtex:type' and 'bibtex:entrytype' as the
    reference type. The synonyms 'summary' and 'subject' never replace an
    abstract that is already set.
*/
void metadataParser::_fuzzyParser(const QString& data)
{
    if (data.isEmpty())
        return;
    QXmlStreamReader parser;
    parser.addData(data);
    QRegExp* fields;
    if (_has_bibtex)
        fields = &_bibtex_fields;
    else
        fields = &_fields;
    QString field; // Qualified name, used for recognition
    QString key;   // Lowercased local name, used as key in _ref
    QString value;
    while (!parser.atEnd())
    {
        parser.readNext();
        if (parser.isStartElement())
        {
            // Do attributes (seems poppler xml composing)
            QXmlStreamAttributes att = parser.attributes();
            for (int i = 0; i < att.count(); ++i)
            {
                field = att.at(i).qualifiedName().toString();
                key = att.at(i).name().toString().toLower();
                value = att.at(i).value().toString();
                if (value.isEmpty())
                    continue;
                if (field.contains(*fields))
                    _ref[key] = value;
                else if (QString::compare(field, "authors", Qt::CaseInsensitive) == 0)
                    _ref["author"] = value;
                else if (QString::compare(field, "summary", Qt::CaseInsensitive) == 0 ||
                         QString::compare(field, "subject", Qt::CaseInsensitive) == 0)
                {
                    // Prefer BibTeX field name than synonyms: the check must be on
                    // the key being written, as _ref never holds the synonym names
                    if (!_ref.contains("abstract"))
                        _ref["abstract"] = value;
                }
                else if (QString::compare(field, "bibtex:type", Qt::CaseInsensitive) == 0 ||
                         QString::compare(field, "bibtex:entrytype", Qt::CaseInsensitive) == 0)
                    _ref["type"] = value.toLower();
            }

            // Do element (exiftool and exempi xml composing)
            // The value is taken from the token that follows the start element
            field = parser.qualifiedName().toString();
            key = parser.name().toString().toLower();
            if (field.contains(*fields))
            {
                parser.readNext();
                value = parser.text().toString().trimmed();
                if (!value.isEmpty())
                    _ref[key] = value;
            }
            else if (QString::compare(field, "authors", Qt::CaseInsensitive) == 0)
            {
                parser.readNext();
                value = parser.text().toString().trimmed();
                if (!value.isEmpty())
                    _ref["author"] = value;
            }
            else if (QString::compare(field, "summary", Qt::CaseInsensitive) == 0 ||
                     QString::compare(field, "subject", Qt::CaseInsensitive) == 0)
            {
                parser.readNext();
                value = parser.text().toString().trimmed();
                // Prefer BibTeX field name than synonyms
                if (!value.isEmpty() && !_ref.contains("abstract"))
                    _ref["abstract"] = value;
            }
            else if (QString::compare(field, "bibtex:type", Qt::CaseInsensitive) == 0 ||
                     QString::compare(field, "bibtex:entrytype", Qt::CaseInsensitive) == 0)
            {
                parser.readNext();
                value = parser.text().toString().trimmed();
                if (!value.isEmpty())
                    _ref["type"] = value.toLower();
            }
        }
    }
    if (parser.hasError())
        c2bUtils::debug(tr("metadataParser: Error while parsing XML packets"));
}

/**
    Writes reference ref as an XMP packet of 'bibtex:' tags into document fn,
    by means of ExifTool.

    The packet and an ExifTool configuration file are temporarily written to
    the directory of the document, and removed once ExifTool is done. Fields
    file and id, and empty fields, are not written.

    \param ref   Reference to insert. Nothing is done if it has no fields.
    \param fn    Document file name.
    \param error If not null, it is cleared and, on failure, set to the reason
                 (it stays empty when ref has no fields).
    \return true if ExifTool ran to completion, exited with code 0, and wrote
            nothing to its standard error; false otherwise.
*/
bool metadataParser::insertMetadata(const bibReference& ref, const QString& fn, QString* error) const
{
    if (error)
        error->clear();
    QString exiftool_bin = _settingsP->fileName("cb2Bib/ExifToolBin");
    if (exiftool_bin.isEmpty())
    {
        if (error) *error = tr("Metadata writer: ExifTool location has not been specified.");
        emit showMessage(tr("Warning - cb2Bib"),
                         tr("Metadata writer: ExifTool location has not been specified."));
        return false;
    }
    if (!QFileInfo(exiftool_bin).exists())
    {
        if (error) *error = tr("Metadata writer: ExifTool file %1 does not exist.").arg(exiftool_bin);
        emit showMessage(tr("Warning - cb2Bib"),
                         tr("Metadata writer: ExifTool file %1 does not exist.").arg(exiftool_bin));
        return false;
    }
    if (ref.count() == 0)
        return false;

    // Compose the BibTeX tags, type first
    QString bibtags;
    QString key;
    QString value;
    QString entry("<bibtex:%1>%2</bibtex:%1>\n");
    bibtags += entry.arg("type").arg(ref.typeName);
    const QStringList& bibliographicFields = _cbpP->bibliographicFields();
    for (int i = 0; i < bibliographicFields.count(); ++i)
    {
        key = bibliographicFields.at(i);
        value = ref.value(key);
        if (value.isEmpty())
            continue;
        if (key == "file" || key == "id")
            continue;
        c2bUtils::fullBibToC2b(value);
        if (key == "title" || key == "booktitle")
            c2bUtils::cleanTitle(value);
        bibtags += entry.arg(key).arg(value);
    }
    QString bibtags_xmp = c2bUtils::fileToString(":/xml/xml/cb2bib.xmp");
    bibtags_xmp.replace("GET_BIBTEX_TAGS", bibtags);
    bibtags_xmp.replace("GET_FORMATTED_AUTHOR", formattedAuthor(ref.value("author")));

    QString workdir = QFileInfo(fn).absolutePath();
    QString bibtags_file = workdir + "/bibtags.xmp";
    c2bUtils::stringToFile(bibtags_xmp, bibtags_file);

    QProcess exiftool;
    QStringList arglist;
    arglist.append("-overwrite_original");
    arglist.append("-m");
    arglist.append("-TagsFromFile");
    arglist.append(bibtags_file);
    arglist.append("-all:all");
    arglist.append("-pdf:all<all");
    arglist.append("-postscript:all<all");
    arglist.append(fn);

    // Point ExifTool to the configuration file written next to the document
    QStringList envlist = QProcess::systemEnvironment();
    envlist.prepend("EXIFTOOL_HOME=" + workdir);
    exiftool.setEnvironment(envlist);
    QString exiftoolconf_file = workdir + "/.ExifTool_config";
    c2bUtils::stringToFile(c2bUtils::fileToString(":/xml/xml/ExifTool_config"), exiftoolconf_file);

    exiftool.start(exiftool_bin, arglist);
    // waitForFinished() is also false if the process could not be started. In
    // either case exitCode() is not meaningful, and must not be read as success
    const bool finished = exiftool.waitForFinished(90000);
    if (!finished)
    {
        exiftool.kill();
        exiftool.waitForFinished(); // Reap it before removing the files it reads
    }
    QString exiftool_error = exiftool.readAllStandardError().trimmed();
    const bool inserted = finished &&
                          exiftool.exitStatus() == QProcess::NormalExit &&
                          exiftool.exitCode() == 0 &&
                          exiftool_error.isEmpty();
    if (!inserted && error)
    {
        if (exiftool_error.isEmpty())
            *error = tr("Metadata writer: ExifTool could not be run or did not finish successfully.");
        else
            *error = exiftool_error;
    }
    QFile::remove(bibtags_file);
    QFile::remove(exiftoolconf_file);
    return inserted;
}

Generated by  Doxygen 1.6.0   Back to index