w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

ugrep.cpp
Go to the documentation of this file.
1 /*************************************************************************
2 *
3 * © 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2002-2010, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 */
14 
15 //
16 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
17 //
18 // The use of the ICU Regex API all occurs within the main()
19 // function. The rest of the code deals with opening files,
20 // encoding conversions, printing results, etc.
21 //
22 // This is not a full-featured grep program. The command line options
23 // have been kept to a minimum to avoid complicating the sample code.
24 //
25 
26 
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "unicode/utypes.h"
33 #include "unicode/ustring.h"
34 #include "unicode/regex.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uclean.h"
37 
38 using namespace icu;
39 
40 //
41 // The following variables contain parameters that may be set from the command line.
42 //
// NOTE(review): the embedded listing numbers jump from 44 to 47 here. The cross-reference
// trailer of this listing places `UBool displayFileName` at ugrep.cpp:45 and
// `UBool displayLineNum` at ugrep.cpp:46, so those two declarations are missing from this copy.
43 const char *pattern = NULL; // The regular expression
                              // (assigned by processOptions(), compiled in main()).
44 int firstFileNum; // argv index of the first file name
                     // (assigned by processOptions(), start of main()'s file loop).
47 
48 
49 //
50 // Info regarding the file currently being processed
51 //
// Name passed to the most recent readFile() call; used in its error messages and by printMatch().
52 const char *fileName;
53 int fileLen; // Length, in UTF-16 Code Units.
             // readFile() resets it to 0 first, so it stays 0 when reading or converting fails.
54 
55 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
56  // the whole file at once.)
    // Grown with realloc() in readFile() and released with free() in main().
57 
58 char *charBuf = 0; // Buffer, for original, unconverted file data.
                      // Grown with realloc() in readFile() and released with free() in main().
59 
60 
61 //
62 // Info regarding the line currently being processed
63 //
64 int lineStart; // Index of first char of the current line in the file buffer
65 int lineEnd; // Index of char following the new line sequence for the current line
// Number of the current line; maintained by nextLine() and printed by printMatch()
// when displayLineNum is set.
66 int lineNum;
67 
68 //
69 // Converter, used on output to convert Unicode data back to char *
70 // so that it will display in non-Unicode terminal windows.
71 //
73 
74 //
75 // Function forward declarations
76 //
// Parses argv and sets the option globals, pattern and firstFileNum.
77 void processOptions(int argc, const char **argv);
// Moves lineStart/lineEnd (and lineNum) to the line that begins at index `start` of ucharBuf.
78 void nextLine(int start);
// Prints the current line, optionally prefixed with the file name and line number.
79 void printMatch();
// Prints the command line summary.
80 void printUsage();
// Loads the named file into charBuf and its UTF-16 conversion into ucharBuf.
81 void readFile(const char *name);
82 
83 
84 
85 //------------------------------------------------------------------------------------------
86 //
87 // main for ugrep
88 //
89 // Structurally, all use of the ICU Regular Expression API is in main(),
90 // and all of the supporting stuff necessary to make a running program, but
91 // not directly related to regular expressions, is factored out into these other
92 // functions.
93 //
94 //------------------------------------------------------------------------------------------
// Entry point. Compiles the global `pattern` into a RegexPattern, creates one RegexMatcher
// from it, then calls readFile() for every argv entry from firstFileNum onward and calls
// printMatch() for each line on which matcher->find() succeeds.
//
// Returns 0 when at least one line matched and 1 otherwise. Exits with -1 when the pattern
// cannot be compiled or the matcher cannot be created.
//
// NOTE(review): the embedded listing numbers skip 101, 130, 143 and 160 inside this function,
// so four source lines are not visible in this copy (presumably the processOptions() call,
// the final fprintf argument, the per-line loop header and one cleanup statement). Restore
// them from the upstream file before building.
95 int main(int argc, const char** argv) {
96  UBool matchFound = false;
97 
98  //
99  // Process the command line options.
100  //
102 
103  //
104  // Create a RegexPattern object from the user supplied pattern string.
105  //
106  UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
107  // in a status variable.
108 
109  UParseError parseErr; // In the event of a syntax error in the regex pattern,
110  // this struct will contain the position of the
111  // error.
112 
113  RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
114  // Note that C++ is doing an automatic conversion
115  // of the (char *) pattern to a temporary
116  // UnicodeString object.
117  if (U_FAILURE(status)) {
 // The %s below receives the ICU error name from u_errorName(), not the pattern text.
118  fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
119  u_errorName(status), parseErr.offset);
120  exit(-1);
121  }
122 
123  //
124  // Create a RegexMatcher from the newly created pattern.
125  //
 // The matcher starts on an empty string; it is pointed at each line with reset() below.
126  UnicodeString empty;
127  RegexMatcher *matcher = rePat->matcher(empty, status);
128  if (U_FAILURE(status)) {
129  fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
131  exit(-1);
132  }
133 
134  //
135  // Loop, processing each of the input files.
136  //
137  for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
 // readFile() leaves fileLen at 0 when the file could not be read or converted.
138  readFile(argv[fileNum]);
139 
140  //
141  // Loop through the lines of a file, trying to match the regex pattern on each.
142  //
 // s spans the UChars of the current line: ucharBuf[lineStart] up to, but not including,
 // ucharBuf[lineEnd]. NOTE(review): presumably ICU's read-only aliasing UnicodeString
 // constructor, i.e. the line is not copied -- confirm against unistr.h.
144  UnicodeString s(false, ucharBuf+lineStart, lineEnd-lineStart);
145  matcher->reset(s);
146  if (matcher->find()) {
147  matchFound = true;
148  printMatch();
149  }
150  }
151  }
152 
153  //
154  // Clean up
155  //
156  delete matcher;
157  delete rePat;
 // ucharBuf and charBuf are the buffers readFile() obtained with realloc().
158  free(ucharBuf);
159  free(charBuf);
161 
162  u_cleanup(); // shut down ICU, release any cached data it owns.
163 
 // Exit status: 0 if some line matched, 1 if none did.
164  return matchFound? 0: 1;
165 }
166 
167 
168 
169 //------------------------------------------------------------------------------------------
170 //
171 // doOptions Run through the command line options, and set
172 // the global variables accordingly.
173 //
174 // exit without returning if an error occurred and
175 // ugrep should not proceed further.
176 //
177 //------------------------------------------------------------------------------------------
178 void processOptions(int argc, const char **argv) {
179  int optInd;
180  UBool doUsage = false;
181  UBool doVersion = false;
182  const char *arg;
183 
184 
185  for(optInd = 1; optInd < argc; ++optInd) {
186  arg = argv[optInd];
187 
188  /* version info */
189  if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
190  doVersion = true;
191  }
192  /* usage info */
193  else if(strcmp(arg, "--help") == 0) {
194  doUsage = true;
195  }
196  else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
197  displayLineNum = true;
198  }
199  /* POSIX.1 says all arguments after -- are not options */
200  else if(strcmp(arg, "--") == 0) {
201  /* skip the -- */
202  ++optInd;
203  break;
204  }
205  /* unrecognized option */
206  else if(strncmp(arg, "-", strlen("-")) == 0) {
207  printf("ugrep: invalid option -- %s\n", arg+1);
208  doUsage = true;
209  }
210  /* done with options */
211  else {
212  break;
213  }
214  }
215 
216  if (doUsage) {
217  printUsage();
218  exit(0);
219  }
220 
221  if (doVersion) {
222  printf("ugrep version 0.01\n");
223  if (optInd == argc) {
224  exit(0);
225  }
226  }
227 
228  int remainingArgs = argc-optInd; // pattern file ...
229  if (remainingArgs < 2) {
230  fprintf(stderr, "ugrep: files or pattern are missing.\n");
231  printUsage();
232  exit(1);
233  }
234 
235  if (remainingArgs > 2) {
236  // More than one file to be processed. Display file names with match output.
237  displayFileName = true;
238  }
239 
240  pattern = argv[optInd];
241  firstFileNum = optInd+1;
242 }
243 
//------------------------------------------------------------------------------------------
//
//   printUsage    Write the command line summary to stdout and return.
//
//                 This function must not terminate the process itself: both call
//                 sites in processOptions() follow it with their own exit(), and the
//                 "files or pattern are missing" path needs its exit(1) to be reached
//                 so that the error is reported through the exit status.
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
        " -V or --version display version information\n"
        " --help display this help and exit\n"
        " -- stop further option processing\n"
        "-n, --line-number Prefix each line of output with the line number within its input file.\n"
        );
}
258 
259 //------------------------------------------------------------------------------------------
260 //
261 // readFile Read a file into memory, and convert it to Unicode.
262 //
263 // Since this is just a demo program, take the simple minded approach
264 // of always reading the whole file at once. No intelligent buffering
265 // is done.
266 //
267 //------------------------------------------------------------------------------------------
// Reads the file `name` into charBuf, skips a Unicode signature (BOM) if one is detected,
// converts the remaining bytes to UTF-16 in ucharBuf and, only on success, stores the
// converted length in fileLen. Every failure path prints a message to stderr and returns
// with fileLen still 0.
//
// NOTE(review): the embedded listing numbers skip 308-309, 313, 325, 341, 346 and 348 inside
// this function. The declarations of `status` and `encoding`, the ucnv_open() call, the
// condition guarding the preflight error message and the start of the second ucnv_toUChars()
// call are therefore not visible in this copy.
268 void readFile(const char *name) {
269 
270  //
271  // Initialize global file variables
272  //
273  fileName = name;
274  fileLen = 0; // zero length prevents processing in case of errors.
275 
276 
277  //
278  // Open the file and determine its size.
279  //
280  FILE *file = fopen(name, "rb");
281  if (file == 0 ) {
282  fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
283  return;
284  }
 // Size is found by seeking to the end and asking for the position.
 // NOTE(review): neither fseek() nor ftell() is checked for failure, and the ftell()
 // result is narrowed to int.
285  fseek(file, 0, SEEK_END);
286  int rawFileLen = ftell(file);
287  fseek(file, 0, SEEK_SET);
288 
289 
290  //
291  // Read in the file
292  //
 // One extra byte is reserved for the terminating 0 stored below.
 // NOTE(review): the realloc() result is assigned straight to charBuf, so a failed
 // allocation loses the old buffer and the fread() below writes through a null pointer.
293  charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
294  int t = static_cast<int>(fread(charBuf, 1, rawFileLen, file));
295  if (t != rawFileLen) {
296  fprintf(stderr, "Error reading file \"%s\"\n", fileName);
297  fclose(file);
298  return;
299  }
300  charBuf[rawFileLen]=0;
301  fclose(file);
302 
303  //
304  // Look for a Unicode Signature (BOM) in the data
305  //
306  int32_t signatureLength;
307  const char * charDataStart = charBuf;
310  charDataStart, rawFileLen, &signatureLength, &status);
311  if (U_FAILURE(status)) {
312  fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
314  return;
315  }
 // A non-null encoding means a signature was recognized: step over its bytes so that
 // they are not converted as file content.
316  if(encoding!=NULL ){
317  charDataStart += signatureLength;
318  rawFileLen -= signatureLength;
319  }
320 
321  //
322  // Open a converter to take the file to UTF-16
323  //
324  UConverter* conv;
326  if (U_FAILURE(status)) {
327  fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
328  return;
329  }
330 
331  //
332  // Convert the file data to UChar.
333  // Preflight first to determine required buffer size.
334  //
 // With a NULL destination of capacity 0 nothing is written; the call only reports the
 // number of UChars the conversion needs.
335  uint32_t destCap = ucnv_toUChars(conv,
336  NULL, // dest,
337  0, // destCapacity,
338  charDataStart,
339  rawFileLen,
340  &status);
342  fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
 // NOTE(review): this return does not pass through the ucnv_close(conv) below, so the
 // converter looks leaked on this path -- confirm.
343  return;
344  };
345 
 // NOTE(review): the realloc() result is not checked here either. The +1 presumably
 // leaves room for a terminating NUL written by the converter -- confirm.
347  ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
349  ucharBuf, // dest,
350  destCap+1,
351  charDataStart,
352  rawFileLen,
353  &status);
354  if (U_FAILURE(status)) {
355  fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
 // NOTE(review): same as above, this return skips ucnv_close(conv).
356  return;
357  };
358  ucnv_close(conv);
359 
360  //
361  // Successful conversion. Set the global size variables so that
362  // the rest of the processing will proceed for this file.
363  //
364  fileLen = destCap;
365 }
366 
367 
368 
369 
370 
371 //------------------------------------------------------------------------------------------
372 //
373 // nextLine Advance the line index variables, starting at the
374 // specified position in the input file buffer, by
375 // scanning forward until the next end-of-line.
376 //
377 // Need to take into account all of the possible Unicode
378 // line ending sequences.
379 //
380 //------------------------------------------------------------------------------------------
381 void nextLine(int startPos) {
382  if (startPos == 0) {
383  lineNum = 0;
384  } else {
385  lineNum++;
386  }
387  lineStart = lineEnd = startPos;
388 
389  for (;;) {
390  if (lineEnd >= fileLen) {
391  return;
392  }
393  UChar c = ucharBuf[lineEnd];
394  lineEnd++;
395  if (c == 0x0a || // Line Feed
396  c == 0x0c || // Form Feed
397  c == 0x0d || // Carriage Return
398  c == 0x85 || // Next Line
399  c == 0x2028 || // Line Separator
400  c == 0x2029) // Paragraph separator
401  {
402  break;
403  }
404  }
405 
406  // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
407  if (lineEnd < fileLen &&
408  ucharBuf[lineEnd-1] == 0x0d &&
409  ucharBuf[lineEnd] == 0x0a)
410  {
411  lineEnd++;
412  }
413 }
414 
415 
416 //------------------------------------------------------------------------------------------
417 //
418 // printMatch Called when a matching line has been located.
419 // Print out the line from the file with the match, after
420 // converting it back to the default code page.
421 //
422 //------------------------------------------------------------------------------------------
// Prints the current line, i.e. the UChars of ucharBuf from lineStart up to lineEnd, after
// converting them into a 2000-byte char buffer. The output is prefixed with "fileName:" when
// displayFileName is set and with "lineNum:" when displayLineNum is set. Exits with -1 if
// the output converter cannot be opened.
//
// NOTE(review): the embedded listing numbers skip 425, 429, 432 and 439 inside this function,
// so the declaration of `status`, the statement that opens outConverter, the last fprintf
// argument and the start of the conversion call are not visible in this copy.
423 void printMatch() {
424  char buf[2000];
426 
427  // If we haven't already created a converter for output, do it now.
428  if (outConverter == 0) {
430  if (U_FAILURE(status)) {
431  fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
433  exit(-1);
434  }
435  };
436 
437  // Convert the line to be printed back to the default 8 bit code page.
438  // If the line is too long for our buffer, just truncate it.
440  buf, // destination buffer for conversion
441  sizeof(buf), // capacity of destination buffer
442  &ucharBuf[lineStart], // Input to conversion
443  lineEnd-lineStart, // number of UChars to convert
444  &status);
 // The status of the conversion is not inspected in the visible lines; an over-long line
 // is handled only by forcing the last byte of buf to 0.
445  buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
446  // The converter null-terminates its output unless
447  // the buffer completely fills.
448 
449  if (displayFileName) {
450  printf("%s:", fileName);
451  }
452  if (displayLineNum) {
453  printf("%d:", lineNum);
454  }
 // No newline is appended: the converted range runs up to lineEnd, which nextLine() places
 // after the line's own terminator, so the terminator is part of buf when the line has one.
455  printf("%s", buf);
456 }
457 
#define empty
Definition: aptex-macros.h:52
#define name
#define free(a)
Definition: decNumber.cpp:310
#define fopen
Definition: xxstdio.h:21
#define fread
Definition: xxstdio.h:25
#define fseek
Definition: xxstdio.h:30
#define ftell
Definition: xxstdio.h:31
int strcmp()
Definition: coll.cpp:143
int printf()
static double conv
Definition: dvicore.c:53
#define s
Definition: afcover.h:80
#define c(n)
Definition: gpos-common.c:150
#define SEEK_SET
Definition: jmemansi.c:26
unsigned char UChar
Definition: bzip2.c:163
#define NULL
Definition: ftobjs.h:61
#define SEEK_END
Definition: ftzconf.h:251
void exit()
unsigned int uint32_t
Definition: stdint.h:80
signed int int32_t
Definition: stdint.h:77
#define buf
#define fclose
Definition: debug.h:100
#define fprintf
Definition: mendex.h:64
int strncmp()
#define realloc
Definition: glob.c:206
#define status
C API: Unicode string handling functions.
C++ API: Regular Expressions.
C_Op * compile(Source *s)
int32_t offset
Definition: parseerr.h:76
Definition: filedef.h:30
Definition: dvips.h:235
#define FILE
Definition: t1stdio.h:34
*job_name strlen((char *) job_name) - 4)
C API: Initialize and clean up ICU.
C API: Character conversion.
int firstFileNum
Definition: ugrep.cpp:44
int lineEnd
Definition: ugrep.cpp:65
void readFile(const char *name)
Definition: ugrep.cpp:268
const char * fileName
Definition: ugrep.cpp:52
int main(int argc, const char **argv)
Definition: ugrep.cpp:95
UChar * ucharBuf
Definition: ugrep.cpp:55
char * charBuf
Definition: ugrep.cpp:58
void processOptions(int argc, const char **argv)
Definition: ugrep.cpp:178
void nextLine(int start)
Definition: ugrep.cpp:381
UBool displayFileName
Definition: ugrep.cpp:45
int lineNum
Definition: ugrep.cpp:66
UConverter * outConverter
Definition: ugrep.cpp:72
UBool displayLineNum
Definition: ugrep.cpp:46
void printUsage()
Definition: ugrep.cpp:249
void printMatch()
Definition: ugrep.cpp:423
int fileLen
Definition: ugrep.cpp:53
int lineStart
Definition: ugrep.cpp:64
int8_t UBool
Definition: umachine.h:269
#define ucnv_toUChars
Definition: urename.h:706
#define u_errorName
Definition: urename.h:226
#define ucnv_close
Definition: urename.h:625
#define ucnv_open
Definition: urename.h:687
#define ucnv_detectUnicodeSignature
Definition: urename.h:636
#define ucnv_fromUChars
Definition: urename.h:648
#define u_cleanup
Definition: urename.h:221
@ start
Definition: preamble.c:52
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Definition: utypes.h:431
@ U_BUFFER_OVERFLOW_ERROR
Definition: utypes.h:481
@ U_ZERO_ERROR
Definition: utypes.h:465
#define U_FAILURE(x)
Definition: utypes.h:735
#define argv
Definition: xmain.c:270
#define argc
Definition: xmain.c:269