"Fossies" - the Fresh Open Source Software Archive 
Member "cb2bib-2.0.1/src/c2b/bibParser.cpp" (12 Feb 2021, 25908 Bytes) of package /linux/privat/cb2bib-2.0.1.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "bibParser.cpp" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
2.0.0_vs_2.0.1.
1 /***************************************************************************
2 * Copyright (C) 2004-2021 by Pere Constans
3 * constans@molspaces.com
4 * cb2Bib version 2.0.1. Licensed under the GNU GPL version 3.
5 * See the LICENSE file that comes with this distribution.
6 ***************************************************************************/
7 #include "bibParser.h"
8
9 #include "arxivXml.h"
10 #include "bibPreparser.h"
11 #include "crJson.h"
12 #include "document.h"
13 #include "heuristicBibParser.h"
14 #include "idMaker.h"
15 #include "preprocess.h"
16 #include "pubmedXml.h"
17 #include "settings.h"
18
19 #include <QCoreApplication>
20 #include <QUrl>
21
22
23 bibParser::bibParser(QObject* parento) : coreBibParser(parento)
24 {
25 // Creating journal name database
26 _journal_dbP = new journalDB(_settingsP->fileName("cb2Bib/JournalFile"));
27
28 // Creating month list
29 _month_dbP = new monthDB();
30
31 // Creating cite and document ID makers
32 _cite_idmP = new idMaker("cb2Bib/CiteIdPattern", this);
33 _file_idmP = new idMaker("cb2Bib/DocumentIdPattern", this);
34
35 // Creating (external) reference preparser
36 _preparserP = new bibPreparser(this);
37 connect(_preparserP, SIGNAL(statusMessage(QString)), this, SIGNAL(statusMessage(QString)));
38
39 // Creating stream preprocess object
40 _preprocessP = new preprocess(this);
41
42 // Creating heuristic bibliographic parser
43 _heuristic_parserP = new heuristicBibParser(this);
44 }
45
46 bibParser::~bibParser()
47 {
48 delete _journal_dbP;
49 delete _month_dbP;
50 delete _heuristic_parserP;
51 }
52
53 /**
54 Process each field and set its final format
55 */
56 QString bibParser::parse(const QString& field, const QString& value, const QString& init_value)
57 {
58 QString v(value);
59 if (field == QLatin1String("file"))
60 return v.trimmed();
61 if (field == "abstract")
62 document::normalize(v, document::Complete);
63 v = removeTags(v);
64 if (v.isEmpty())
65 return v;
66 c2bUtils::fullBibToC2b(v);
67 if (field == QLatin1String("author"))
68 {
69 c2bUtils::debug(v);
70 v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
71 }
72 else if (field == QLatin1String("addauthors"))
73 {
74 c2bUtils::debug(v);
75 if (init_value.isEmpty())
76 v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
77 else
78 v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
79 }
80 else if (field == QLatin1String("editor"))
81 {
82 c2bUtils::debug(v);
83 v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
84 v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
85 }
86 else if (field == QLatin1String("addeditors"))
87 {
88 c2bUtils::debug(v);
89 v.remove(QRegExp("\\((Editor|Editors|Ed|Eds)\\.{0,1}\\)", Qt::CaseInsensitive));
90 if (init_value.isEmpty())
91 v = _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
92 else
93 v = init_value + " and " + _authorString.toBibTeX(v, _settingsP->value("cb2Bib/UseFullNames").toBool());
94 }
95 else if (field == QLatin1String("doi"))
96 {
97 v.remove(QRegExp("^.+(?=10\\.[\\d\\.]+/\\S+)"));
98 if (v.endsWith(QLatin1Char('.')) || v.endsWith(QLatin1Char(',')) || v.endsWith(QLatin1Char(';')))
99 v.chop(1);
100 }
101 else if (field == QLatin1String("url"))
102 {
103 // Remove redundant DOI URLs
104 if (v.contains("doi.org/10."))
105 v.clear();
106 }
107 else if (field == QLatin1String("isbn"))
108 v.remove(' ');
109 else if (field == QLatin1String("journal"))
110 {
111 if (_settingsP->value("cb2Bib/SetJournalsToFullname").toBool())
112 v = fullJournal(v);
113 else
114 v = abbreviatedJournal(v);
115 }
116 else if (field == QLatin1String("keywords"))
117 {
118 v.replace(" - ", ",");
119 QStringList kl(v.split(QRegExp("[^\\w\\s-']"), QString::SkipEmptyParts));
120 kl.removeAll(" ");
121 kl.removeDuplicates();
122 v = kl.join(", ");
123 v.replace(QRegExp("\\s+,"), ",");
124 v = v.toLower();
125 }
126 else if (field == QLatin1String("month"))
127 v = _month_dbP->retrieve(v);
128 // Fields edition and note require first letter capitalization
129 else if (field == QLatin1String("edition"))
130 v = c2bUtils::setCapitalization(v);
131 else if (field == QLatin1String("note"))
132 v = c2bUtils::setCapitalization(v);
133 // Process pages, volume, number, and year to set hyphenation
134 else if (field == QLatin1String("pages"))
135 v = adjacentNumbers(v);
136 else if (field == QLatin1String("volume"))
137 v = adjacentNumbers(v);
138 else if (field == QLatin1String("number"))
139 v = adjacentNumbers(v);
140 else if (field == QLatin1String("year"))
141 v = adjacentNumbers(v);
142 else if (field == QLatin1String("title") || field == QLatin1String("booktitle"))
143 v = c2bUtils::setCapitalization(v);
144 else if (field == QLatin1String("addtitle"))
145 {
146 if (!init_value.isEmpty())
147 v = init_value + QLatin1String(": ") + v;
148 v = c2bUtils::setCapitalization(v);
149 }
150 return c2bUtils::simplifyString(v);
151 }
152
153 /**
154 Process each field and set its final format for a complete reference
155 */
156 bibReference& bibParser::parse(bibReference& reference)
157 {
158 QMutableHashIterator<QString, QString> i(reference);
159 while (i.hasNext())
160 {
161 i.next();
162 const QString v(parse(i.key(), i.value()));
163 i.setValue(v);
164 }
165 return reference;
166 }
167
168 QString bibParser::setJournalsToFull(const QString& text, const bool alternate)
169 {
170 const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenceToLaTeX").toBool());
171 QString substituted_text(text);
172 QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
173 QString jn;
174 int pos(0);
175 uint nj(0);
176 while (pos >= 0)
177 {
178 pos = jnre.indexIn(substituted_text, pos);
179 if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substituted_text, &jn))
180 {
181 const int jnlength(jn.length());
182 c2bUtils::bibToC2b(jn);
183 jn = alternate ? alternateFullJournal(jn) : fullJournal(jn);
184 if (ConvertReferenceToLaTeX)
185 c2bUtils::c2bToBib(jn);
186 pos += jnre.matchedLength();
187 substituted_text.replace(pos, jnlength, jn);
188 nj++;
189 }
190 else if (pos >= 0)
191 ++pos;
192 }
193 emit statusMessage(tr("Processed %1 journal names.").arg(nj));
194 return substituted_text;
195 }
196
197 QString bibParser::setJournalsToAbbreviated(const QString& text, const bool alternate)
198 {
199 const bool ConvertReferenceToLaTeX(_settingsP->value("cb2Bib/ConvertReferenceToLaTeX").toBool());
200 QString substituted_text(text);
201 QRegExp jnre("\\bjournal\\s*=\\s*[\\{\"]", Qt::CaseInsensitive);
202 QString jn;
203 int pos(0);
204 uint nj(0);
205 while (pos >= 0)
206 {
207 pos = jnre.indexIn(substituted_text, pos);
208 if (pos > 0 && c2bUtils::inBraces(pos + jnre.matchedLength(), substituted_text, &jn))
209 {
210 const int jnlength(jn.length());
211 c2bUtils::bibToC2b(jn);
212 jn = alternate ? alternateAbbreviatedJournal(jn) : abbreviatedJournal(jn);
213 if (ConvertReferenceToLaTeX)
214 c2bUtils::c2bToBib(jn);
215 pos += jnre.matchedLength();
216 substituted_text.replace(pos, jnlength, jn);
217 nj++;
218 }
219 else if (pos >= 0)
220 ++pos;
221 }
222 emit statusMessage(tr("Processed %1 journal names.").arg(nj));
223 return substituted_text;
224 }
225
226 QString bibParser::excerpt(const QString& text, const QStringList& hints) const
227 {
228 QString txt(removeTags(text));
229 txt.replace(QRegExp("\\[Bibliographic Metadata.+/Bibliographic Metadata\\]"), " ");
230 txt.replace(QRegExp("(http://|https://|ftp://|www\\.|ftp\\.)(www\\.|ftp\\.){0,1}\\S+"), " ");
231 txt.replace(QRegExp("\\b[A-Z]+\\b"), " ");
232 txt.replace(QRegExp("\\d"), " ");
233 txt.replace(c2bUtils::nonLetter, " ");
234 txt.replace(QRegExp("\\b\\w{1,2}\\b"), " ");
235 txt.replace(
236 QRegExp("\\b(about|and|are|com|for|from|how|into|that|the|their|this|where|with|www)\\b", Qt::CaseInsensitive),
237 " ");
238 txt.replace(QRegExp("\\b(january|february|march|april|may|june|july|august|september|october|november|december)\\b",
239 Qt::CaseInsensitive),
240 " ");
241 txt = hints.join(" ") + ' ' + txt;
242 txt = c2bUtils::simplifyString(txt);
243 const QStringList txtlist(txt.split(' ', QString::SkipEmptyParts));
244 QStringList txtlistSimp;
245 for (int i = 0; i < std::min(15, txtlist.count()); ++i)
246 txtlistSimp.append(txtlist.at(i));
247 return txtlistSimp.join(" ");
248 }
249
250 void bibParser::setField(const QString& name, const QString& value)
251 {
252 _current_reference[name] = parse(name, value);
253 }
254
255 void bibParser::setReferenceType(const QString& type)
256 {
257 _current_reference.typeName = type;
258 }
259
260 void bibParser::setCiteID()
261 {
262 _current_reference.citeidName = _cite_idmP->makeID(_current_reference);
263 }
264
265 QString bibParser::documentFilename(const QString& base_fn, const QString& bibtex_fn)
266 {
267 if (base_fn.isEmpty())
268 return base_fn;
269 if (!_settingsP->value("cb2Bib/MovePdf").toBool())
270 {
271 const QUrl u(base_fn);
272 const QString scheme(u.scheme());
273 if (scheme == "file")
274 return parse("file", u.toLocalFile());
275 else
276 return parse("file", base_fn);
277 }
278 QString filename(_file_idmP->makeID(_current_reference));
279 if (filename.isEmpty())
280 filename = "no_cite_id";
281 QFileInfo fi(base_fn);
282 QString docExtension('.' + fi.suffix().toLower());
283 // Possible document extensions
284 if (!docExtension.contains(QRegExp("^\\.\\w{2,4}$")))
285 docExtension = ".pdf"; // Default
286 if (docExtension == ".gz")
287 if (fi.completeSuffix().toLower() == ".ps.gz")
288 docExtension = ".ps.gz"; // Composite extension
289 filename = c2bUtils::documentFilename(_settingsP->value("cb2Bib/RelativePdfDirectory").toBool(), bibtex_fn,
290 _settingsP->fileName("cb2Bib/PdfDirectory"), filename + docExtension);
291 return parse("file", filename);
292 }
293
294
295 /****************************************************************************
296
297 AUTOMATIC BIB CAPTION
298
299 *****************************************************************************/
300
301 void bibParser::preparse(const QString& text, QString* out_text)
302 {
303 _preparserP->preparse(text, out_text);
304 }
305
306 /** \page clipboard Extracting Data from the Clipboard
307
308 Clipboard contents are processed according to the following rules:
309
310 - Perform external, user-defined preparsing on input stream. See \ref c2bconf_clipboard.
311
312 - Perform user-defined substitutions on input stream. See \ref c2bconf_clipboard.
313
314 - Check if input stream is already a BibTeX entry. If so, process entry.
315
316 - Check if input stream is, in this order of preference, a PubMed XML,
317 arXiv XML, CR JSON, or Medline entry. If so, process entry.
318
319 - Preprocess author names: PI JOAN III -> Pi III, J.
320 (care of name prefixes, suffixes, and removal of ambiguities).
321
322
323 If otherwise,
324
325 - Extract DOI \n (DOI, URL and FILE/PDF are preprocessed, performed before
326 the automatic recognition takes place.)
327
328 - Extract URL
329
330 - Remove leading and trailing white spaces, TABs and CRs.
331
332 - "\r\n", "\n" and/or "\r" replaced by the line indicator tag <code><NewLineN></code>.
333
334 - Replace "\t" and ten or more consecutive "\s" by the tabular tag <code><TabN></code>.
335
336 - Simplify white spaces
337
338 - Start the automatic recognition engine.
339
340
341 If the automatic recognition engine fails, optionally, a heuristic guessing
342 will be performed.
343
344
345 See also \ref heuristics and \ref metadata.
346
347 */
348 void bibParser::parse(const QString& text, QString* out_text, QString* out_tagtext)
349 {
350 QString& otext = *out_text;
351 otext = text;
352 QString& ottext = *out_tagtext;
353 ottext.clear();
354 _preprocessP->preprocessText(out_text);
355
356 if (hasBibTeX(otext))
357 {
358 _current_reference = wholeReference(otext);
359 parse(_current_reference);
360 currentReferenceUpdated();
361 _auto_recognized_string = tr("Processed as 'BibTeX'.");
362 _auto_recognized = true;
363 emit statusMessage(_auto_recognized_string);
364 return;
365 }
366
367 const QString pubmed_identifier("<PubmedArticle>");
368 if (otext.contains(pubmed_identifier))
369 {
370 pubmedXml pxml(metadataSection(otext, pubmed_identifier));
371 if (pxml.hasError())
372 {
373 _auto_recognized_string = tr("Error parsing 'PubMed XML': %1.").arg(pxml.errorString());
374 _auto_recognized = false;
375 }
376 else
377 {
378 _current_reference = pxml.reference();
379 parse(_current_reference);
380 currentReferenceUpdated();
381 _auto_recognized_string = tr("Processed as 'PubMed XML'.");
382 _auto_recognized = true;
383 }
384 emit statusMessage(_auto_recognized_string);
385 return;
386 }
387
388 const QString arxiv_identifier("arxiv.org/api/");
389 const QString cr_identifier("{\"status\":\"ok\",\"message-type\":\"work\"");
390 if (otext.contains(arxiv_identifier) || otext.contains(cr_identifier))
391 {
392 if (otext.contains(cr_identifier))
393 {
394 crJson crjson(metadataSection(otext, cr_identifier));
395 if (crjson.hasError())
396 {
397 _auto_recognized_string = tr("Error parsing 'CR JSON': %1.").arg(crjson.errorString());
398 _auto_recognized = false;
399 emit statusMessage(_auto_recognized_string);
400 return;
401 }
402 else
403 {
404 _current_reference = crjson.reference();
405 _auto_recognized_string = tr("Processed as 'CR JSON'.");
406 }
407 }
408 if (otext.contains(arxiv_identifier))
409 {
410 arxivXml axml(metadataSection(otext, arxiv_identifier));
411 if (axml.hasError())
412 {
413 _auto_recognized_string = tr("Error parsing 'arXiv XML': %1.").arg(axml.errorString());
414 _auto_recognized = false;
415 if (_current_reference.size() > 0)
416 _current_reference.clearReference();
417 emit statusMessage(_auto_recognized_string);
418 return;
419 }
420 else
421 {
422 if (_current_reference.size() == 0)
423 {
424 _current_reference = axml.reference();
425 _auto_recognized_string = tr("Processed as 'arXiv XML'.");
426 }
427 else
428 {
429 // Merging arXiv and CR metadata
430 const bibReference& areference(axml.reference());
431 if (!areference.value("abstract").isEmpty())
432 _current_reference["abstract"] = areference.value("abstract");
433 if (!areference.value("title").isEmpty())
434 _current_reference["title"] = areference.value("title");
435 if (!areference.value("eprint").isEmpty())
436 _current_reference["eprint"] = areference.value("eprint");
437 if (!areference.value("url").isEmpty())
438 _current_reference["url"] = areference.value("url");
439 _auto_recognized_string = tr("Processed as merged 'arXiv XML' and 'CR JSON'.");
440 }
441 }
442 }
443 parse(_current_reference);
444 currentReferenceUpdated();
445 _auto_recognized = true;
446 emit statusMessage(_auto_recognized_string);
447 return;
448 }
449
450 if (otext.contains(QRegExp("^\\s*PMID\\s*-")))
451 {
452 _current_reference.typeName = "article";
453 otext = ' ' + otext;
454 ottext = otext;
455 // http://www.nlm.nih.gov/bsd/mms/medlineelements.html
456 ottext.replace(QRegExp("[\\n\\r]\\s*([A-Z]{2,4}\\s*-)"), "][\\1"); // Two to four capital letter in field tags
457 ottext = c2bUtils::simplifyString(ottext);
458 if (!ottext.contains(QRegExp("\\[FAU\\s+-")))
459 ottext.replace(QRegExp("\\[(AU\\s*-\\s*[-'\\w]+)"), "[F\\1 ");
460 QStringList fList(ottext.split("]["));
461 QString kw;
462 QRegExp fld("^([A-Z]{2,4})\\s{0,1}-\\s*(.+)$");
463 fld.setPatternSyntax(QRegExp::RegExp2);
464 for (QStringList::Iterator it = fList.begin(); it != fList.end(); ++it)
465 {
466 if (fld.indexIn(*it) == -1)
467 continue;
468 const QString tag(fld.cap(1));
469 QString value(fld.cap(2));
470 if (tag == "AB")
471 _current_reference["abstract"] = parse("abstract", value);
472 else if (tag == "FAU")
473 _current_reference["author"] =
474 parse("addauthors", authorFromMedline(value), _current_reference.value("author"));
475 else if (tag == "TA")
476 _current_reference["journal"] = parse("journal", value);
477 else if (tag == "IP")
478 _current_reference["number"] = parse("number", value);
479 else if (tag == "PG")
480 _current_reference["pages"] = parse("pages", value);
481 else if (tag == "TI")
482 _current_reference["title"] = parse("title", value);
483 else if (tag == "PMID")
484 _current_reference["url"] = parse("url", c2bUtils::pubmedUrl.arg(value));
485 else if (tag == "VI")
486 _current_reference["volume"] = parse("volume", value);
487 else if (tag == "AID")
488 {
489 if (value.contains("[doi]"))
490 _current_reference["doi"] = parse("doi", value.remove("[doi]"));
491 }
492 else if (tag == "DP")
493 _current_reference["year"] = parse("year", value.replace(QRegExp("^([\\d\\s]+).*$"), "\\1"));
494 else if (tag == "MH")
495 kw += "; " + value.trimmed();
496 }
497 if (!kw.isEmpty())
498 _current_reference["keywords"] = parse("keywords", kw.remove(0, 2));
499 currentReferenceUpdated();
500 _auto_recognized_string = tr("Processed as 'PubMed - Medline Journals'.");
501 _auto_recognized = true;
502 emit statusMessage(_auto_recognized_string);
503 return;
504 }
505
506 _heuristic_parserP->heuristicFields(otext);
507
508 // Set tags and start regular expression extraction
509 ottext = setTags(otext);
510 QString regular_expression_f(_settingsP->fileName("cb2Bib/RegularExpressionFile"));
511 checkRegExpFile(regular_expression_f);
512 QFile file(regular_expression_f);
513 file.open(QIODevice::ReadOnly | QIODevice::Text);
514 QString ItemX;
515 QString line;
516 QString reftype;
517 QString fieldset;
518 QTextStream stream(&file);
519 stream.setCodec("UTF-8");
520 stream.setAutoDetectUnicode(true);
521 int nfilters(0);
522
523 while (!stream.atEnd())
524 {
525 line = stream.readLine();
526 if (!(line.isEmpty() || line.contains(QRegExp("^#"))))
527 {
528 reftype = stream.readLine();
529 fieldset = stream.readLine();
530 ItemX = stream.readLine();
531
532 c2bUtils::debug(tr("The RegExp file contains1: |%1|").arg(line));
533 c2bUtils::debug(tr("The RegExp file contains2: |%1|").arg(reftype));
534 c2bUtils::debug(tr("The RegExp file contains3: |%1|").arg(fieldset));
535 c2bUtils::debug(tr("The RegExp file contains4: |%1|").arg(ItemX));
536
537 QRegExp rx(ItemX);
538 rx.setMinimal(true);
539 if (!rx.isValid())
540 c2bUtils::warn(tr("RegExp |%1| is not valid").arg(ItemX));
541
542 const QStringList list(fieldset.split(' ', QString::SkipEmptyParts));
543 const int efields(list.count());
544 const int cfields(rx.captureCount());
545 int npos(rx.indexIn(ottext));
546 c2bUtils::debug(tr("Expected Fields: |%1|").arg(efields));
547 c2bUtils::debug(tr("Captured Fields: |%1|").arg(cfields));
548 c2bUtils::debug(tr("Position: |%1|").arg(npos));
549 if (efields != cfields)
550 {
551 c2bUtils::warn(
552 tr("RegExp |%1| is not valid. Mismatch between expected and actual captures").arg(ItemX));
553 npos = -1;
554 }
555 nfilters++;
556
557 if (npos > -1)
558 {
559 for (int i = 0; i < cfields; i++)
560 {
561 const QString& listi = list.at(i);
562 int ii(i + 1);
563 c2bUtils::debug(QString("Fields in Template %1: |%2|").arg(i).arg(rx.cap(ii)));
564 if (_field_re.indexIn(listi) > -1)
565 {
566 if (listi == "author")
567 // Reminder: "addauthors" requires to init _current_reference["author"]
568 _current_reference[listi] =
569 parse("addauthors", rx.cap(ii), _current_reference.value(listi));
570 else if (listi == "editor")
571 // Reminder: "addeditors" requires to init _current_reference["editor"]
572 _current_reference[listi] =
573 parse("addeditors", rx.cap(ii), _current_reference.value(listi));
574 else if (listi == "title")
575 // Reminder: "addtitle" requires to init _current_reference["title"]
576 _current_reference[listi] = parse("addtitle", rx.cap(ii), _current_reference.value(listi));
577 else
578 _current_reference[listi] = parse(listi, rx.cap(ii));
579 }
580 }
581 _current_reference.typeName = reftype;
582 currentReferenceUpdated();
583 _auto_recognized_string = tr("Processed as '%1'.").arg(line);
584 _auto_recognized = true;
585 emit statusMessage(_auto_recognized_string);
586 file.close();
587 return;
588 }
589 }
590 }
591 file.close();
592
593 // Heuristic Bib Parsing
594 if (_settingsP->value("cb2Bib/DoHeuristicGuess").toBool())
595 {
596 // Sometimes (if user is on tag mode) tag could be on otext. Revert tags here, just in case.
597 const QString clean_text(removeTags(ottext));
598 _heuristic_parserP->guessFields(clean_text, ottext);
599 currentReferenceUpdated();
600 _auto_recognized_string =
601 tr("Applied %1 filters: No automatic format detection. %2 fields guessed.").arg(nfilters).arg(fieldCount());
602 }
603 else
604 _auto_recognized_string = tr("Applied %1 filters: No automatic format detection.").arg(nfilters);
605 emit statusMessage(_auto_recognized_string);
606 }
607
608 void bibParser::checkRegExpFile(const QString& fn)
609 {
610 if (fn.isEmpty())
611 {
612 c2bUtils::warn(tr("No regular expression file especified"));
613 return;
614 }
615 QFileInfo fi(fn);
616 if (!fi.exists() || !fi.isReadable())
617 {
618 c2bUtils::warn(tr("Could not open regular expression file %1 for reading").arg(fn));
619 return;
620 }
621 }
622
623 void bibParser::guessFields(const QString& text)
624 {
625 const QString clean_text(text.simplified());
626 const QString tagged_text(setTags(text));
627 _heuristic_parserP->heuristicFields(clean_text);
628 _heuristic_parserP->guessFields(clean_text, tagged_text);
629 currentReferenceUpdated();
630 _auto_recognized_string = tr("%1 fields guessed.").arg(fieldCount());
631 emit statusMessage(_auto_recognized_string);
632 }
633
634 QString bibParser::setTags(const QString& text) const
635 {
636 QString tagged_text(text.trimmed());
637 tagged_text.replace(QRegExp("\\r\\n"), "<found_new_line>"); // Windows new line
638 tagged_text.replace(QRegExp("\\n"), "<found_new_line>"); // Linux new line, LF
639 tagged_text.replace(QRegExp("\\r"), "<found_new_line>"); // OSX new line, CR
640 QStringList spText(tagged_text.split("<found_new_line>"));
641 int n(spText.count());
642 tagged_text.clear();
643 for (int i = 0; i < n - 1; i++)
644 tagged_text += spText.at(i) + QString("<NewLine%1>").arg(i + 1);
645 tagged_text += spText[n - 1];
646 spText = tagged_text.split(QRegExp("(\\s{10,}|\\t)"));
647 n = spText.count();
648 tagged_text.clear();
649 for (int i = 0; i < n - 1; i++)
650 tagged_text += spText.at(i) + QString("<Tab%1>").arg(i + 1);
651 tagged_text += spText[n - 1];
652 tagged_text = c2bUtils::simplifyString(tagged_text);
653 return tagged_text;
654 }
655
656 QString bibParser::removeTags(const QString& text) const
657 {
658 QString clean(text);
659 clean.remove("[[");
660 clean.remove("]]");
661 clean.replace(QRegExp("<NewLine\\d+>"), " ");
662 clean.replace(QRegExp("<Tab\\d+>"), " ");
663 clean = c2bUtils::simplifyString(clean);
664 return clean;
665 }
666
667 QString bibParser::metadataSection(const QString& text, const QString& identifier) const
668 {
669 const QStringList lines(text.split(QRegExp("[\\r\\n]"), QString::KeepEmptyParts));
670 const int nl(lines.count());
671 int il(-1);
672 int sl(0);
673 int el(0);
674 for (int l = 0; l < nl; ++l)
675 if (lines.at(l).contains(identifier))
676 {
677 il = l;
678 break;
679 }
680 if (il == -1)
681 return QString();
682 for (int l = il + 1; l < nl; ++l)
683 if (lines.at(l) == "/Raw Metadata]")
684 {
685 el = l;
686 break;
687 }
688 for (int l = il; l >= 0; --l)
689 if (lines.at(l) == "[Raw Metadata")
690 {
691 sl = l;
692 break;
693 }
694 if (sl == 0 && el == 0)
695 return text.trimmed();
696 if (sl == 0 || el == 0)
697 return QString();
698 QString section;
699 for (int l = sl + 1; l < el; ++l)
700 section += '\n' + lines.at(l);
701 return section.trimmed();
702 }