// authorString.h -- cb2bib source code listing

/***************************************************************************
 *   Copyright (C) 2004-2009 by Pere Constans
 *   constans@molspaces.com
 *   cb2Bib version 1.3.0. Licensed under the GNU GPL version 3.
 *   See the LICENSE file that comes with this distribution.
 ***************************************************************************/
#ifndef AUTHORSTRING_H
#define AUTHORSTRING_H

#include "cb2bib_utilities.h"

#include <QRegExp>
#include <QString>
#include <QStringList>


namespace author
{

static const QString double_initials("(Al|Kh|Th|Ya|Yu|Zs)");
static const QString prefixes("(da|de|dal|del|der|di|do|du|dos|la|le|lo|van|vande|von|zur)");

static const QString name("(?:\\w[-'\\w]{1,})");
static const QString initials("(?:\\s*-{0,1}\\b\\w\\b\\.{0,1}){1,3}");
static const QString reversed_romance_name("(?:\\w[-'\\w]{1,})\\s+(?:\\w[-'\\w]{1,}),\\s*(?:\\w[-'\\w]{1,}|" + initials + ")");

static const QRegExp reversed_name_rx("^" + name + ",");
// Cases 'n1 n2, n3', 'n1 n2, n3 and n4 n5, n6', 'n1 n2, n3 and n4, n5 n6' are necessarily reverse order
static const QRegExp reversed_romance_name_rx("^(?:" + reversed_romance_name + "|"
        + reversed_romance_name + " and " + reversed_romance_name + "|"
        + reversed_romance_name + " and (?:\\w[-'\\w]{1,}),\\s*(?:\\w[-'\\w]{1,}|\\w[-'\\w]{1,} \\w[-'\\w]{1,}|" + initials + "))$");

// simplifyString
static const QRegExp simplify_string_rx1(QString("%1(?=\\w)").arg(QChar(8217)));
static const QRegExp simplify_string_rx2(QRegExp("\\d\\d+"));
static const QRegExp simplify_string_rx3(QRegExp("\\d(?=\\s\\w\\w)"));
static const QRegExp simplify_string_rx4(QRegExp("\\d[\\*,;][a-z]\\b"));
static const QRegExp simplify_string_rx5(QRegExp("\\d"));
static const QRegExp simplify_string_rx6(QRegExp("[^-',;:\\|/&\\.\\s\\w]"));
inline QString& simplifyString(QString& author, const bool full = false)
{
    if (full) // Characters | and : are used for the encoder
    {
        author.replace('|', ' ');
        author.replace(':', ' ');
    }
    author.replace(simplify_string_rx1, "\'"); // Normalize apostrophe
    author.replace(simplify_string_rx2, "/"); // Break dates, addresses, etc, but remove from author's foot notes.
    author.replace(simplify_string_rx3, ","); // Help no-separator designs, and also break zip codes.
    author.replace(simplify_string_rx4, " ");
    author.remove(simplify_string_rx5); // Better remove if no conflict. It will help to not confuse with chemical formula.
    author.replace(simplify_string_rx6, " ");
    author = author.simplified();
    return author;
}

extern QString fromMedline(const QString& author);

struct unifier
{
    unifier();
    QString& unifyNames(QString& author);
    QRegExp unifier_rx1;
    QRegExp unifier_rx2;
    QRegExp unifier_rx3;
    QRegExp unifier_rx4;
    QRegExp unifier_rx5;
    QRegExp unifier_rx6;
    QRegExp unifier_rx7;
};

/**
    Implementation of author field extraction
    P. Constans. A Simple Extraction Procedure for Bibliographical Author Field.
    arXiv:0902.0755v1, 2009.
*/
00077 struct encoder
{
    encoder() {}
    encoder(const QString& str)
    {
        encode(str);
    }
    QString code;
    QString decoded(const int position, const int length) const;
    QStringList fragments;
    unifier as_unifier;
    void clear()
    {
        code.clear();
        fragments.clear();
    }
    void encode(const QString& raw);
    void scape(const int pos, const int length, const QChar c = ' ')
    {
        const int pos0 = qMax(pos, 0);
        const int posn = qMin(pos0 + length, code.length());
        for (int i = pos0; i < posn; ++i)
            if (code.at(i) != 'L')
                code[i] = c;
    }
    void scapePattern(const QString pattern, const QChar c = ' ')
    {
        QRegExp rx(pattern, Qt::CaseSensitive);
        rx.setMinimal(true);
        rx.setPatternSyntax(QRegExp::RegExp2);
        if (!rx.isValid())
            exit(6);
        int pos = 0;
        while (pos > -1)
        {
            pos = rx.indexIn(code, pos);
            if (pos > -1)
            {
                scape(pos, rx.matchedLength(), c);
                pos  += rx.matchedLength();
            }
        }
    }
    inline bool hasUpper(const QString& str) const
    {
        for (int i = 0; i < str.length(); i++)
            if (str.at(i).isLetter())
                if (str.at(i).category() == QChar::Letter_Uppercase)
                    return true;
        return false;
    }
    inline bool isCapitalName(const QString& w) const
    {
        return c2bUtils::isUpperCaseString(w, qMax(0, w.lastIndexOf('_', -5))); // Skip prefixes and suffixes
    }
    inline bool isSeparator(const QString& w) const
    {
        return (w == "and") || (w == "&");
    }
    bool isAdparticle(const QString& w) const;
    bool isInitial(const QString& w) const;
    bool isName(const QString& w) const;
    bool isPlainWord(const QString& w) const;
};

} // namespace author


class authorString
{

public:
    authorString();
    ~authorString();

    QString toBibTeX(const QString& author, bool full_form = false);


private:
    QString _author_string;
    QString capitalize(const QString& name) const;
    QString processFirstMiddle(const QString& first_middle) const;
    author::unifier as_unifier;
    bool _full_form;
    bool containLowerCaseLetter(const QString& author) const;
    bool containUpperCaseLetter(const QString& author) const;
    bool isReverseOrder(const QString& author) const;

};

#endif

// Generated by Doxygen 1.6.0