"Fossies" - the Fresh Open Source Software Archive 
Member "cb2bib-2.0.1/src/c2b/authorString.cpp" (12 Feb 2021, 26152 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively you can here
view or
download the uninterpreted source code file.
For more information about "authorString.cpp" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
2.0.0_vs_2.0.1.
1 /***************************************************************************
2 * Copyright (C) 2004-2021 by Pere Constans
3 * constans@molspaces.com
4 * cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
5 * See the LICENSE file that comes with this distribution.
6 ***************************************************************************/
7 #include "authorString.h"
8
9
10 namespace author
11 {
12
13 unifier::unifier()
14 : name("(?:\\w[-'\\w]{1,})"),
15 initials("(?:\\s*-{0,1}\\b\\w\\b\\.{0,1}){1,3}"),
16 double_initials("(Al|Ch|Kh|Md|Th|Xh|Ya|Yu|Zs)"),
17 prefixes("(da|de|dal|del|der|di|do|du|dos|el|la|le|lo|van|vande|von|zur)"),
18 reversed_romance_name("(?:\\w[-'\\w]{1,})\\s+(?:\\w[-'\\w]{1,}),\\s*(?:\\w[-'\\w]{1,}|" + initials + ')'),
19
20 reversed_name_rx('^' + name + ','),
21 // Cases 'n1 n2, n3', 'n1 n2, n3 and n4 n5, n6', 'n1 n2, n3 and n4, n5 n6' are necessarily reverse order
22 reversed_romance_name_rx("^(?:" + reversed_romance_name + '|' + reversed_romance_name + " and " +
23 reversed_romance_name + '|' + reversed_romance_name +
24 " and (?:\\w[-'\\w]{1,}),\\s*(?:\\w[-'\\w]{1,}|\\w[-'\\w]{1,} \\w[-'\\w]{1,}|" + initials +
25 "))$"),
26
27 unifier_rx1("(\\w\\w)\\si\\s(\\w\\w)(?!d\\b)", Qt::CaseSensitive),
28 unifier_rx2("\\b" + prefixes + "\\s(?!(?:,|and\\b))", Qt::CaseInsensitive),
29 unifier_rx3("\\b" + double_initials + "\\.", Qt::CaseInsensitive),
30 unifier_rx4("\\b(\\w[-'\\w]{2,})\\W+Jr\\.", Qt::CaseInsensitive),
31 unifier_rx5("\\b(\\w[-'\\w]{2,})\\W+Jr\\b", Qt::CaseInsensitive),
32 unifier_rx6("(\\w),{0,1}\\s(II|III|IV)\\b", Qt::CaseSensitive),
33 unifier_rx7("([^\\w-])[a-z](?=[^\\w'])"),
34
35 simplify_string_rx1(QString("%1(?=\\w)").arg(QChar(8217))),
36 simplify_string_rx2("\'(?!\\w)"),
37 simplify_string_rx3("\\d\\d+"),
38 simplify_string_rx4("\\d(?=\\s\\w\\w)"),
39 simplify_string_rx5("\\d[\\*,;][a-z]\\b"),
40 simplify_string_rx6("\\d"),
41 simplify_string_rx7("[^-',;:\\|/&\\.\\s\\w]") {}
42
43
44 QString& unifier::unifyNames(QString& author) const
45 {
46 // Composite Names temporary unified
47 author.replace(unifier_rx1, "\\1+i+\\2");
48 author.replace(unifier_rx2, "\\1+");
49 author.replace("Da+", "Da ", Qt::CaseSensitive);
50 author.replace(unifier_rx3, "\\1+ ");
51 if (author.contains("Jr", Qt::CaseInsensitive))
52 {
53 // Remove period and first comma if there
54 author.replace(unifier_rx4, "\\1+JR");
55 author.replace(unifier_rx5, "\\1+JR");
56 }
57 if (author.contains('I', Qt::CaseSensitive))
58 author.replace(unifier_rx6, "\\1+\\2");
59 author.replace('+', '_');
60 author.replace(unifier_rx7, "\\1 "); // Cleaning affiliation 'superscripts'. Avoid cleaning 'M.-m. Lin'
61 return author;
62 }
63
64 QString& unifier::simplifyString(QString& author, const bool full) const
65 {
66 if (full) // Characters | and : are used for the encoder
67 {
68 author.replace('|', ' ');
69 author.replace(':', ' ');
70 }
71 author.replace(simplify_string_rx1, "\'"); // Normalize apostrophe
72 author.remove(simplify_string_rx2); // Remove spurious apostrophes
73 author.replace(simplify_string_rx3, "/"); // Break dates, addresses, etc, but remove from author's foot notes.
74 author.replace(simplify_string_rx4, ","); // Help no-separator designs, and also break zip codes.
75 author.replace(simplify_string_rx5, " ");
76 author.remove(
77 simplify_string_rx6); // Better remove if no conflict. It will help to not confuse with chemical formula.
78 author.replace(simplify_string_rx7, " ");
79 author = c2bUtils::simplifyString(author);
80 return author;
81 }
82
83 QString unifier::fromMedline(const QString& author) const
84 {
85 // Preprocess Author from Medline 'AAAAAAA BB' to Aaaaaaa, BB'
86 // which can be unambiguously translated to 'B. B. Aaaaaaa'
87 // Takes care of cb2Bib included prefixes and suffixes
88 // FAU - Foa, Edna B
89 // AU - Foa EB
90 // FAU - Steketee, Gail S
91 // AU - Steketee GS
92
93 QString FullN(author.simplified());
94 FullN.replace(QRegExp("\\b" + prefixes + "\\s", Qt::CaseInsensitive), "\\1+");
95 FullN.replace('+', '_');
96 QStringList parts;
97 QString LastN;
98 if (FullN.contains(',')) // Some FAU are 'Last1 Last2, First'
99 {
100 parts = FullN.split(',', QString::SkipEmptyParts);
101 if (parts.count() > 1)
102 LastN = parts.takeFirst();
103 }
104 else
105 {
106 parts = FullN.split(' ', QString::SkipEmptyParts);
107 if (parts.count() > 1)
108 LastN = parts.takeFirst();
109 }
110 FullN = parts.join(" ");
111 parts = FullN.split(' ', QString::SkipEmptyParts);
112 if (!LastN.isEmpty())
113 if (c2bUtils::isUpperCaseString(LastN))
114 {
115 LastN = LastN.toLower();
116 LastN[0] = LastN.at(0).toUpper();
117 int ii(LastN.indexOf(QRegExp("[-']")));
118 if (ii++ > 0)
119 LastN[ii] = LastN.at(ii).toUpper();
120 }
121 QString FirstN;
122 for (int i = 0; i < parts.count(); ++i)
123 FirstN += ' ' + parts.at(i);
124 QString isSuffix;
125 if (parts.count() > 0)
126 isSuffix = parts.last();
127 if (isSuffix.contains(QRegExp("\\b(?:2nd|3rd|Jr|II|III)\\b")))
128 {
129 isSuffix.replace(QRegExp("\\b2nd\\b"), "II");
130 isSuffix.replace(QRegExp("\\b3rd\\b"), "III");
131 LastN += ' ' + isSuffix;
132 FirstN.remove(QRegExp("\\b(?:2nd|3rd|Jr|II|III)\\b"));
133 }
134 LastN.replace(QRegExp(prefixes + '_', Qt::CaseInsensitive), "\\1 ");
135 FullN = LastN + ',' + FirstN;
136 return FullN;
137 }
138
139
140 /**
141 Implementation of author field extraction
142 P. Constans. A Simple Extraction Procedure for Bibliographical Author Field.
143 arXiv:0902.0755, 2009.
144 */
145 void encoder::encode(const QString& raw)
146 {
147 clear();
148 QString str(raw);
149 aunifier.unifyNames(str);
150 int position(0);
151 int length(0);
152 for (int i = 0; i < str.length(); ++i)
153 {
154 const QChar& si = str[i];
155 if (si.isLetter())
156 ++length;
157 else if (si == '_')
158 ++length;
159 else if (si == '-')
160 ++length;
161 else if (si == '\'')
162 ++length;
163 else
164 {
165 if (length > 0)
166 fragments.append(str.mid(position, length));
167 position = i + 1;
168 length = 0;
169 if (si != ' ')
170 fragments.append(str.at(i));
171 }
172 }
173 if (length > 0)
174 fragments.append(str.mid(position, length));
175 for (int i = 0; i < fragments.count(); ++i)
176 {
177 const QString& w = fragments.at(i);
178 if (isSeparator(w))
179 code += '&';
180 else if (isAdparticle(w))
181 code += 'a';
182 else if (isInitial(w))
183 code += 'I';
184 else if (isPlainWord(w))
185 code += 'w';
186 else if (isName(w))
187 {
188 if (isCapitalName(w))
189 code += 'N';
190 else
191 code += 'n';
192 }
193 else if (w.at(0) == '.')
194 code += 'p';
195 else if (w.at(0) == ',')
196 code += ',';
197 else if (w.at(0) == ';')
198 code += ';';
199 else if (w.at(0) == ':')
200 code += ':';
201 else if (w.at(0) == '|')
202 code += 'L';
203 else
204 code += 'o';
205 }
206 scapePattern("aL+[nN]{1,2}");
207 scapePattern("a[nNw]&L+[nN]{1,2}"); // in Linear and / Sublinear Time
208 scapePattern(":L+[InN]{1,2}"); // ... Structure Classification: / A Survey
209 scapePattern("[nN]*&L[nN]L"); // Not an & for author
210 }
211
212 QString encoder::decoded(const int position, const int length) const
213 {
214 if (position < 0)
215 return QString();
216 if (length < 1 || position + length > fragments.count())
217 return QString();
218 QString d(fragments.at(position));
219 for (int i = 1; i < length; ++i)
220 d += ' ' + fragments.at(position + i);
221 // Above extra spaces are fine, except in these cases
222 d.replace(" . -", ".-");
223 d.replace(" ,", ",");
224 return d;
225 }
226
227 bool encoder::isPlainWord(const QString& w)
228 {
229 if (w.length() > 1)
230 {
231 if (w.contains('_') || w.contains('-'))
232 return c2bUtils::isLowerCaseString(w);
233 if (w.at(0).isLetter())
234 if (w.at(0).category() == QChar::Letter_Lowercase)
235 return true;
236 }
237 return false;
238 }
239
240 bool encoder::isInitial(const QString& w)
241 {
242 if (w.length() == 1)
243 if (w.at(0).isLetter())
244 return w.at(0).isUpper();
245 if (w.length() == 2)
246 if (w.at(0) == '-')
247 if (w.at(1).isLetter())
248 return true; // Chinese composite might(?) be lower
249 return false;
250 }
251
252 bool encoder::isName(const QString& w)
253 {
254 if (w.length() < 2)
255 return false;
256 if (w.at(0).isUpper())
257 return true;
258 if (w.contains('_'))
259 return hasUpper(w);
260 return false;
261 }
262
263 bool encoder::isAdparticle(const QString& w)
264 {
265 const int ws(w.size());
266 if (ws < 2)
267 return false;
268 if (ws > 6)
269 return false;
270 const QByteArray ba(w.toLatin1());
271 const char* s = ba.data();
272 const int ss(ws * int(sizeof(char)));
273 if (ws == 2)
274 {
275 if (memcmp("of", s, ss) == 0)
276 return true;
277 if (memcmp("on", s, ss) == 0)
278 return true;
279 if (memcmp("to", s, ss) == 0)
280 return true;
281 if (memcmp("in", s, ss) == 0)
282 return true;
283 if (memcmp("as", s, ss) == 0)
284 return true;
285 if (memcmp("vs", s, ss) == 0)
286 return true;
287 if (memcmp("at", s, ss) == 0)
288 return true;
289 if (memcmp("is", s, ss) == 0)
290 return true;
291 if (memcmp("an", s, ss) == 0)
292 return true;
293 }
294 if (ws == 3)
295 {
296 if (memcmp("for", s, ss) == 0)
297 return true;
298 if (memcmp("but", s, ss) == 0)
299 return true;
300 if (memcmp("are", s, ss) == 0)
301 return true;
302 if (memcmp("its", s, ss) == 0)
303 return true;
304 if (memcmp("the", s, ss) == 0)
305 return true;
306 }
307 if (ws == 4)
308 {
309 if (memcmp("from", s, ss) == 0)
310 return true;
311 if (memcmp("with", s, ss) == 0)
312 return true;
313 if (memcmp("into", s, ss) == 0)
314 return true;
315 }
316 if (ws == 6)
317 {
318 if (memcmp("within", s, ss) == 0)
319 return true;
320 }
321 return false;
322 }
323
324 } // namespace author
325
326
327 /** \page authorproc Processing of Author Names
328
329 cb2Bib automatically processes the author names string. It uses a set of
330 heuristic rules. First, the author separator is identified. Second, it
331 is decided whether the author names are in natural order, in reverse order, or
332 in the 'Abcd, E., F. Ghij, ...' mixed order.
333
334 */
335 authorString::authorString() : _full_form(false) {}
336
337
338 /** \page authorproc
339
340 Cleanup author string:
341
342 - Escape BibTeX to Unicode
343
344 - Remove digits from authors string
345
346 - Remove any character except <tt>-',;&\\.\\s\\w</tt>
347
348 - Simplify white spaces
349
350 - Consider composing prefixes <tt>(da|de|dal|del|der|di|do|du|dos|el|la|le|lo|van|vande|von|zur)</tt>
351
352 - Consider composing suffixes <tt>(II|III|IV|Jr)</tt>
353
354 - Some publishers use superscripts to refer to multiple author affiliations.
355 Text clipboard copying loses superscript formatting. Author strings are
356 cleaned of 'orphan' lowercase single letters in a preprocessing step.
357 Everything matching the pattern <b>[a-z]</b> is removed. Fortunately,
358 abbreviated initials are most commonly input as uppercase letters, thus
359 permitting a correct superscript cleanup. \n <em>Caution:</em> Lowercase,
360 single, a to z letters are removed from the authors' string.\n <em>Caution:</em>
361 Superscripts <b>will be added to the author's Last Name</b> if no separation is
362 provided. Users should check for this and correct these cases.
363
364
365 Rules to identify separators:
366 - Contains comma and semicolon -> ';'
367 - Contains pattern <tt>'^Abcd, E.-F.,'</tt> -> ','
368 - Contains pattern <tt>'^Abcd,'</tt> -> 'and'
369 - Contains comma -> ','
370 - Contains semicolon -> ';'
371 - Any other -> 'and'
372
373 */
374 QString authorString::toBibTeX(const QString& author, bool full_form)
375 {
376 _full_form = full_form;
377 _author_string = author;
378 // BibTeX braces interfere with authorString, remove them even though some BibTeX meaning might be lost
379 _author_string.remove('{');
380 _author_string.remove('}');
381 au.simplifyString(_author_string, true);
382 au.unifyNames(_author_string);
383 const bool has_comma(_author_string.contains(','));
384 const bool has_semicolon(_author_string.contains(';'));
385 const bool has_ands(_author_string.count(" and ") > 1);
386 const bool is_first_reversed(_author_string.contains(au.reversed_name_rx));
387 const bool is_special_case(_author_string.contains(au.reversed_romance_name_rx));
388 bool is_string_reversed((has_comma && has_semicolon) || (has_comma && has_ands) || is_special_case);
389
390 QString separator;
391 if (is_special_case)
392 separator = " and ";
393 else if (has_comma && has_semicolon)
394 separator = ';'; // Multiple Authors, separated by semicolon, reversed naming
395 else if (has_comma)
396 {
397 if (is_first_reversed)
398 {
399 if (_author_string.contains(QRegExp('^' + au.name + ",(?:\\s*-{0,1}\\b\\w\\b\\.){1,3},\\s*" + au.name)))
400 {
401 _author_string.replace(QRegExp("\\bJr.", Qt::CaseSensitive), "Jr");
402 _author_string.replace(".,", ".;");
403 // Reversed, comma separated 'Abrahamsson, A.-L., Springett, J., Karlsson, L., Ottosson, T.'
404 separator = ';';
405 is_string_reversed = true;
406 }
407 else if (_author_string.contains(QRegExp('^' + au.name + ',' + au.initials + ',')))
408 {
409 _author_string.replace(QRegExp("^([-'\\w]+),"), "\\1 ");
410 separator = ','; // Mixed naming 'Smith, J.-L., R. Jones, and K. Gibbons'
411 }
412 else
413 separator = " and "; // Reversed naming
414 }
415 else if (has_ands)
416 separator = " and ";
417 else // Natural naming
418 separator = ',';
419 }
420 else if (has_semicolon)
421 separator = ';'; // Multiple Authors, separated by semicolon
422 else
423 separator = " and ";
424 c2bUtils::debug(QObject::tr("Separator: |%1|").arg(separator));
425 c2bUtils::debug("1--|" + _author_string + '|');
426 _author_string.replace(QRegExp("\\band\\b", Qt::CaseInsensitive), separator);
427 _author_string.replace(QRegExp("\\s&\\s", Qt::CaseInsensitive), separator);
428 c2bUtils::debug("2--|" + _author_string + '|');
429 _author_string.remove(QRegExp("[^\\w\\.]+$")); // Removing of duplicate commas and semicolons
430 _author_string.replace(QRegExp(",\\s*"), ",");
431 c2bUtils::debug("3--|" + _author_string + '|');
432 _author_string.replace(QRegExp(",+"), ",");
433 _author_string.replace(QRegExp(";\\s*"), ";");
434 _author_string.replace(QRegExp(";+"), ";");
435 c2bUtils::debug("4--|" + _author_string + '|');
436 const bool are_authors_in_uppercase(containUpperCaseLetter(_author_string) &&
437 !containLowerCaseLetter(_author_string));
438 if (are_authors_in_uppercase)
439 c2bUtils::debug("Input Authors in Uppercase");
440 QStringList authors;
441 if (separator == " and ")
442 authors = _author_string.split(QRegExp("\\band\\b"));
443 else
444 authors = _author_string.split(separator);
445
446 // Setting author ordering
447 const QString first_author(authors.first().trimmed());
448 bool is_current_reversed(is_string_reversed || is_first_reversed || isReverseOrder(first_author));
449 const QString last_author(authors.last().trimmed());
450 const bool is_last_reversed(is_string_reversed || last_author.contains(au.reversed_name_rx) ||
451 isReverseOrder(last_author));
452 const bool is_string_mixed(is_current_reversed && !is_last_reversed);
453 if (is_string_mixed) // Mixed naming 'Smith, J., R. Jones'
454 c2bUtils::debug("Mixed order");
455
456 // Process each author name
457 for (int ai = 0; ai < authors.count(); ++ai)
458 {
459 QString author_i(authors.at(ai));
460 c2bUtils::debug(author_i);
461 author_i.replace(QRegExp("\\.{0,1}\\s{0,1}-"), "-"); // Abbreviated cases, eg M.-H. Something
462 author_i.replace(QRegExp("[^-'\\w,]"), " "); // Only these characters compose a name; keep commas
463 author_i = c2bUtils::simplifyString(author_i);
464
465 // Split author name
466 QStringList fore_name_parts;
467 QString last_name;
468 if (is_current_reversed)
469 {
470 const QStringList parts(author_i.split(',', QString::SkipEmptyParts));
471 const int nparts(parts.count());
472 if (nparts == 2)
473 {
474 QStringList p(parts.first().split(' ', QString::SkipEmptyParts));
475 if (p.count() > 0)
476 last_name = p.takeLast();
477 fore_name_parts = parts.last().split(' ', QString::SkipEmptyParts) + p;
478 }
479 else if (nparts == 3)
480 {
481 QStringList p(parts.first().split(' ', QString::SkipEmptyParts));
482 if (p.count() > 0)
483 last_name = p.takeLast();
484 fore_name_parts = parts.at(1).split(' ', QString::SkipEmptyParts) + p;
485 if (parts.last().contains(QRegExp("^(?:Jr|II|III|IV)$"))) // If otherwise, ignore it
486 last_name += '_' + parts.last();
487 }
488 else
489 {
490 fore_name_parts = author_i.split(' ', QString::SkipEmptyParts);
491 if (fore_name_parts.count() > 0)
492 last_name = fore_name_parts.takeFirst();
493 }
494 c2bUtils::debug("Reversed order");
495 }
496 else
497 {
498 fore_name_parts = author_i.split(' ', QString::SkipEmptyParts);
499 if (fore_name_parts.count() > 0)
500 last_name = fore_name_parts.takeLast();
501 c2bUtils::debug("Natural order");
502 }
503
504 // Process first and middle names
505 QString author_name;
506 for (int i = 0; i < fore_name_parts.count(); ++i)
507 {
508 c2bUtils::debug("First and Midle: " + fore_name_parts.at(i));
509 if (fore_name_parts.at(i).contains('-')) // Composite names
510 {
511 const QStringList fnpi(fore_name_parts.at(i).split('-'));
512 if (fnpi.count() > 1)
513 {
514 author_name += processFirstMiddle(fnpi.at(0)) + '-';
515 author_name += processFirstMiddle(fnpi.at(1)) + ' '; // Shouldn't be more than 2 parts...
516 }
517 }
518 else // Regular names
519 {
520 QString fore_name(fore_name_parts.at(i));
521 const int fore_length(fore_name.length());
522 const bool is_uppercase(!containLowerCaseLetter(fore_name));
523 if (fore_name_parts.count() == 1 && fore_length > 1 &&
524 !fore_name.contains(QRegExp("\\b" + au.double_initials + '_', Qt::CaseInsensitive)) &&
525 !are_authors_in_uppercase && is_uppercase)
526 {
527 // Cases 'Last, FST': Always abbreviated, no call to processFirstMiddle
528 for (int l = 0; l < fore_length; ++l)
529 author_name += fore_name[l] + ". ";
530 }
531 else if (fore_name_parts.count() == 2 && fore_length > 1 && fore_length < 3 && is_current_reversed &&
532 !are_authors_in_uppercase && is_uppercase)
533 {
534 // Cases 'Last1 Last2, FST': Always abbreviated, no call to processFirstMiddle
535 for (int l = 0; l < fore_length; ++l)
536 author_name += fore_name[l] + ". ";
537 }
538 else if (i == 1 && fore_name_parts.count() == 2 && fore_length > 1 && fore_length < 3 &&
539 !is_current_reversed && !are_authors_in_uppercase && is_uppercase)
540 {
541 // Cases 'Fore IJ Last': Process initials
542 for (int l = 0; l < fore_length; ++l)
543 author_name += fore_name[l] + ". ";
544 }
545 else
546 author_name += processFirstMiddle(fore_name) + ' ';
547 }
548 }
549 // Add last name
550 author_name += capitalize(last_name);
551 authors[ai] = author_name;
552 c2bUtils::debug(author_name);
553 if (is_string_mixed) // Mixed naming 'Smith, J., R. Jones'
554 is_current_reversed = false;
555 }
556
557 authors.removeAll(QString());
558 _author_string = authors.join(" and ");
559 // Restore Composite Names white spaces
560 _author_string.replace("_i_", " i ");
561 _author_string.replace(QRegExp("_II\\b", Qt::CaseInsensitive), " II"); // Suffix can be lower case here
562 _author_string.replace(QRegExp("_III\\b", Qt::CaseInsensitive), " III");
563 _author_string.replace(QRegExp("_IV\\b", Qt::CaseInsensitive), " IV");
564 _author_string.replace(QRegExp("_JR\\b", Qt::CaseInsensitive), " Jr");
565 _author_string.replace(QRegExp(au.prefixes + '_', Qt::CaseInsensitive), "\\1 ");
566 _author_string.replace(QRegExp("\\b" + au.double_initials + '_', Qt::CaseInsensitive), "\\1.");
567 _author_string = c2bUtils::simplifyString(_author_string);
568
569 return _author_string;
570 }
571
572 QString authorString::processFirstMiddle(const QString& first_middle) const
573 {
574 // Process First and Middle parts
575 // Abbreviates if required
576 // Takes care of abbreviation periods
577 QString proc_fm;
578 if (_full_form)
579 {
580 if (first_middle.length() > 1)
581 proc_fm = capitalize(first_middle);
582 else
583 proc_fm = first_middle + '.';
584 }
585 else
586 {
587 if (first_middle.contains('_')) // Composite names should not be abbreviated
588 {
589 proc_fm = capitalize(first_middle);
590 if (first_middle.length() - first_middle.indexOf('_') == 2)
591 proc_fm += '.';
592 }
593 else if (first_middle.length() > 0)
594 proc_fm = first_middle.at(0) + '.';
595 }
596 return proc_fm;
597 }
598
599 QString authorString::capitalize(const QString& name) const
600 {
601 // Capitalizes author's name
602 if (name.isEmpty())
603 return QString();
604 QString proc_name(name);
605 int ii(0);
606 const int prefixes(proc_name.count(QRegExp(au.prefixes + "_(?!(?:Jr|II|III|IV)\\b)", Qt::CaseInsensitive)));
607 for (int p = 0; p < prefixes; ++p)
608 {
609 const int iin(proc_name.indexOf('_', ii));
610 if (c2bUtils::isUpperCaseString(proc_name, ii, iin))
611 for (int i = 0; i < iin; ++i)
612 proc_name[i] = proc_name.at(i).toLower();
613 ii = std::min(iin + 1, proc_name.length() - 1);
614 }
615 if (c2bUtils::isUpperCaseString(proc_name, ii))
616 {
617 proc_name[ii] = proc_name.at(ii).toUpper();
618 for (int i = ++ii; i < proc_name.length(); ++i)
619 proc_name[i] = proc_name.at(i).toLower();
620 ii = proc_name.indexOf(QRegExp("[\\s-']")); // As before, assume just one part
621 if (ii++ > 0)
622 if (ii < proc_name.length())
623 proc_name[ii] = proc_name.at(ii).toUpper();
624 if (proc_name.startsWith("Mc"))
625 if (proc_name.length() > 4)
626 proc_name[2] = proc_name.at(2).toUpper();
627 }
628 return proc_name;
629 }
630
631 /** \page authorproc
632
633 Rules to identify ordering:
634 - Contains comma and semicolon -> Reverse
635 - Pattern <tt>'^Abcd,'</tt> -> Reverse
636 - Pattern <tt>'^Abcd EF Ghi'</tt> -> Natural
637 - Pattern <tt>'^Abcd EF'</tt> -> Reverse
638 - Pattern <tt>'^Abcd E.F.'</tt> -> Reverse
639 - Any other pattern -> Natural
640
641 */
642 bool authorString::isReverseOrder(const QString& author) const
643 {
644 // Returns true if Author Name is in reversed order as "Him DF, Her SR, "
645 // ISI doesn't contain point - return for safety
646 // Consider "Him DF Last"
647 const QString author_line(author.simplified());
648 QRegExp rRevNISI("^([-'\\w]+) ((\\w\\.\\s*)+)$");
649 rRevNISI.setMinimal(false);
650 if (rRevNISI.indexIn(author_line) > -1)
651 {
652 const QString Last(rRevNISI.cap(3));
653 if (Last != "and")
654 return true;
655 }
656 if (author_line.contains('.'))
657 return false;
658 rRevNISI = QRegExp("^([-'\\w]+) ([-'\\w]+) ([-'\\w]+)");
659 rRevNISI.setMinimal(false);
660 if (rRevNISI.indexIn(author_line) > -1)
661 {
662 const QString Last(rRevNISI.cap(3));
663 if (Last != "and")
664 return false;
665 }
666 rRevNISI = QRegExp("^([-'\\w]+) ([-\\w]{1,3})$"); // Consider only 1 to 3 initials
667 rRevNISI.setMinimal(false);
668 if (rRevNISI.indexIn(author_line) > -1)
669 {
670 const QString Last(rRevNISI.cap(1));
671 const QString First(rRevNISI.cap(2));
672 c2bUtils::debug(QObject::tr("ISI: |%1| |%2|").arg(Last, First));
673 if (containLowerCaseLetter(First))
674 return false;
675 if (!containLowerCaseLetter(Last))
676 return false;
677 return true;
678 }
679 return false;
680 }
681
682 bool authorString::containLowerCaseLetter(const QString& author) const
683 {
684 QString author_line(author);
685 author_line.remove(QRegExp("\\band\\b")); // Remove possible 'and' separator
686 author_line.remove(QRegExp(au.prefixes + '_', Qt::CaseInsensitive)); // Remove possible prefixes
687 author_line.remove(QRegExp(au.double_initials + '_', Qt::CaseSensitive)); // Remove possible two-letter initials
688 for (int i = 0; i < author_line.length(); i++)
689 {
690 if (author_line.at(i).isLetter())
691 if (author_line.at(i).category() == QChar::Letter_Lowercase)
692 return true;
693 }
694 return false;
695 }
696
697 bool authorString::containUpperCaseLetter(const QString& author)
698 {
699 for (int i = 0; i < author.length(); i++)
700 {
701 if (author.at(i).isLetter())
702 if (author.at(i).category() == QChar::Letter_Uppercase)
703 return true;
704 }
705 return false;
706 }