"Fossies" - the Fresh Open Source Software Archive

Member "duff-0.5.2/src/duffdriver.c" (28 Jan 2012, 11224 Bytes) of package /linux/privat/old/duff-0.5.2.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

/*
 * duff - Duplicate file finder
 * Copyright (c) 2005 Camilla Berglund <elmindreda@elmindreda.org>
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any
 * damages arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any
 * purpose, including commercial applications, and to alter it and
 * redistribute it freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you
 *     must not claim that you wrote the original software. If you use
 *     this software in a product, an acknowledgment in the product
 *     documentation would be appreciated but is not required.
 *
 *  2. Altered source versions must be plainly marked as such, and
 *     must not be misrepresented as being the original software.
 *
 *  3. This notice may not be removed or altered from any source
 *     distribution.
 */

#if HAVE_CONFIG_H
#include "config.h"
#endif

#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#if HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif

#if HAVE_INTTYPES_H
#include <inttypes.h>
#elif HAVE_STDINT_H
#include <stdint.h>
#endif

#if HAVE_ERRNO_H
#include <errno.h>
#endif

#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#if HAVE_STDIO_H
#include <stdio.h>
#endif

#if HAVE_STRING_H
#include <string.h>
#endif

#if HAVE_STDARG_H
#include <stdarg.h>
#endif

#if HAVE_STDLIB_H
#include <stdlib.h>
#endif

#if HAVE_DIRENT_H
  #include <dirent.h>
  #define NAMLEN(dirent) strlen((dirent)->d_name)
#else
  #define dirent direct
  #define NAMLEN(dirent) (dirent)->d_namlen
  #if HAVE_SYS_NDIR_H
    #include <sys/ndir.h>
  #endif
  #if HAVE_SYS_DIR_H
    #include <sys/dir.h>
  #endif
  #if HAVE_NDIR_H
    #include <ndir.h>
  #endif
#endif

#include "duffstring.h"
#include "duff.h"

/* The number of buckets to use for files.
 */
#define BUCKET_COUNT (1 << HASH_BITS)

/* Calculates the bucket index corresponding to the specified file size.
 */
#define BUCKET_INDEX(size) ((size) & (BUCKET_COUNT - 1))
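
/* Because BUCKET_COUNT is 1 << HASH_BITS it is always a power of two, so
 * the mask (BUCKET_COUNT - 1) is a cheap way of computing
 * (size) % BUCKET_COUNT.  For example, if HASH_BITS were 10 (the actual
 * value lives in duff.h), BUCKET_COUNT would be 1024 and a 6000-byte file
 * would land in bucket 6000 & 1023 == 880.
 */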

/* These flags are defined and documented in duff.c.
 */
extern int follow_links_mode;
extern int all_files_flag;
extern int null_terminate_flag;
extern int recursive_flag;
extern int ignore_empty_flag;
extern int quiet_flag;
extern int physical_flag;
extern int excess_flag;
extern const char* header_format;
extern int header_uses_digest;

/* Typedefs for the structs below.
 */
typedef struct Dir Dir;
typedef struct DirList DirList;

/* Represents a single physical directory.
 */
struct Dir
{
  dev_t device;
  ino_t inode;
};

/* Represents a list of physical directories.
 */
struct DirList
{
  Dir* dirs;
  size_t allocated;
  size_t available;
};
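
/* Note on the fields above: judging from record_directory below,
 * 'allocated' counts the Dir entries currently in use, while 'available'
 * is the capacity of the 'dirs' array; the list grows when the two
 * become equal.
 */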

/* List of traversed physical directories, used to avoid loops.
 */
static DirList recorded_dirs;

/* Buckets of lists of collected files.
 */
static FileList buckets[BUCKET_COUNT];

/* These functions are documented below, where they are defined.
 */
static int stat_path(const char* path, struct stat* sb, int depth);
static int has_recorded_directory(dev_t device, ino_t inode);
static void record_directory(dev_t device, ino_t inode);
static void process_directory(const char* path,
                              const struct stat* sb,
                              int depth);
static void process_file(const char* path, struct stat* sb);
static void process_path(const char* path, int depth);
static void report_cluster(const FileList* cluster, unsigned int index);
static void process_clusters(void);

/* Initializes the driver, processes the specified arguments and reports the
 * clusters found.
 */
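/* Presumably invoked from main() in duff.c with the non-option arguments
 * once the command line has been parsed, i.e. something along the lines of
 *
 *   process_args(argc - optind, argv + optind);
 *
 * though the exact call site lives in duff.c rather than here.
 */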
void process_args(int argc, char** argv)
{
  size_t i;

  memset(&recorded_dirs, 0, sizeof(DirList));

  for (i = 0;  i < BUCKET_COUNT;  i++)
    init_file_list(&buckets[i]);

  if (argc)
  {
    /* Read file names from command line */
    for (i = 0;  i < argc;  i++)
    {
      kill_trailing_slashes(argv[i]);
      process_path(argv[i], 0);
    }
  }
  else
  {
    char* path;

    /* Read file names from stdin */
    while ((path = read_path(stdin)))
    {
      kill_trailing_slashes(path);
      process_path(path, 0);
      free(path);
    }
  }

  process_clusters();

  for (i = 0;  i < BUCKET_COUNT;  i++)
    free_file_list(&buckets[i]);

  free(recorded_dirs.dirs);
  memset(&recorded_dirs, 0, sizeof(DirList));
}

/* Stats a file according to the specified options.
 */
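/* Returns 0 if *sb was filled in and the path should be processed further.
 * Returns -1 if the path is empty, if lstat (or stat, when a symlink is
 * followed) fails, if the path is a symlink that the current
 * follow_links_mode says not to follow, or if it is a symlink that
 * resolves to a directory, since symlinked directories are never entered.
 */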
static int stat_path(const char* path, struct stat* sb, int depth)
{
  if (*path == '\0')
    return -1;

  if (lstat(path, sb) != 0)
  {
    if (!quiet_flag)
      warning("%s: %s", path, strerror(errno));

    return -1;
  }

  if (S_ISLNK(sb->st_mode))
  {
    if (follow_links_mode == ALL_SYMLINKS ||
        (depth == 0 && follow_links_mode == ARG_SYMLINKS))
    {
      if (stat(path, sb) != 0)
      {
        if (!quiet_flag)
          warning("%s: %s", path, strerror(errno));

        return -1;
      }

      if (S_ISDIR(sb->st_mode))
        return -1;
    }
    else
      return -1;
  }

  return 0;
}

/* Returns true if the directory has already been recorded.
 * TODO: Implement a more efficient data structure.
 */
static int has_recorded_directory(dev_t device, ino_t inode)
{
  size_t i;
  const Dir* dirs = recorded_dirs.dirs;

  for (i = 0;  i < recorded_dirs.allocated;  i++)
  {
    if (dirs[i].device == device && dirs[i].inode == inode)
      return 1;
  }

  return 0;
}

/* Records the specified directory.
 * TODO: Implement a more efficient data structure.
 */
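/* The backing array starts at 1024 entries and doubles each time it fills
 * up.  Assigning the result of realloc directly back to the list would
 * leak the old block on failure, but error() presumably terminates the
 * program, so the old pointer is never needed again.
 */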
static void record_directory(dev_t device, ino_t inode)
{
  if (recorded_dirs.allocated == recorded_dirs.available)
  {
    size_t count;

    if (recorded_dirs.available)
      count = recorded_dirs.available * 2;
    else
      count = 1024;

    recorded_dirs.dirs = realloc(recorded_dirs.dirs, count * sizeof(Dir));
    if (recorded_dirs.dirs == NULL)
      error(_("Out of memory"));

    recorded_dirs.available = count;
  }

  recorded_dirs.dirs[recorded_dirs.allocated].device = device;
  recorded_dirs.dirs[recorded_dirs.allocated].inode = inode;
  recorded_dirs.allocated++;
}

/* Recurses into a directory, collecting all files, or only non-hidden
 * files, according to the specified options.
 */
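/* Each directory is identified by its (st_dev, st_ino) pair and recorded
 * before it is read, so the same physical directory is never traversed
 * twice, even if it is reachable by more than one path (for example, when
 * it is named more than once on the command line).  Entries whose names
 * begin with '.' are skipped unless all_files_flag is set, and "." and
 * ".." are always skipped.
 */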
static void process_directory(const char* path,
                              const struct stat* sb,
                              int depth)
{
  DIR* dir;
  struct dirent* dir_entry;
  char* child_path;
  const char* name;

  if (has_recorded_directory(sb->st_dev, sb->st_ino))
    return;

  record_directory(sb->st_dev, sb->st_ino);

  dir = opendir(path);
  if (!dir)
  {
    if (!quiet_flag)
      warning("%s: %s", path, strerror(errno));

    return;
  }

  while ((dir_entry = readdir(dir)))
  {
    name = dir_entry->d_name;
    if (name[0] == '.')
    {
      if (!all_files_flag)
        continue;

      if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
        continue;
    }

    if (asprintf(&child_path, "%s/%s", path, name) > 0)
    {
      process_path(child_path, depth);
      free(child_path);
    }
  }

  closedir(dir);
}

/* Processes a single file.
 */
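/* Empty files are dropped when ignore_empty_flag is set.  With
 * physical_flag, a file whose (device, inode) pair is already present in
 * its bucket is skipped, so each set of hard links contributes only one
 * entry.  Everything else is appended to the size-indexed bucket for
 * later comparison.
 */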
static void process_file(const char* path, struct stat* sb)
{
  if (sb->st_size == 0)
  {
    if (ignore_empty_flag)
      return;
  }

  /* NOTE: Check for duplicate arguments? */

  if (physical_flag)
  {
    /* TODO: Make this less pessimal */

    size_t i, bucket = BUCKET_INDEX(sb->st_size);

    for (i = 0;  i < buckets[bucket].allocated;  i++)
    {
      if (buckets[bucket].files[i].device == sb->st_dev &&
          buckets[bucket].files[i].inode == sb->st_ino)
      {
        return;
      }
    }
  }

  init_file(alloc_file(&buckets[BUCKET_INDEX(sb->st_size)]), path, sb);
}

/* Processes a path name according to its type, whether from the command line or
 * from directory recursion.
 *
 * This function calls process_file and process_directory as needed.
 */
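/* Regular files are handed to process_file.  Directories are handed to
 * process_directory only when recursive_flag is set; otherwise they fall
 * through to the default case and, like every other non-regular file
 * type, merely produce a warning unless quiet_flag is set.
 */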
static void process_path(const char* path, int depth)
{
  mode_t mode;
  struct stat sb;

  if (stat_path(path, &sb, depth) != 0)
    return;

  mode = sb.st_mode & S_IFMT;
  switch (mode)
  {
    case S_IFREG:
    {
      process_file(path, &sb);
      break;
    }

    case S_IFDIR:
    {
      if (recursive_flag)
      {
        process_directory(path, &sb, depth + 1);
        break;
      }

      /* FALLTHROUGH */
    }

    default:
    {
      if (!quiet_flag)
      {
        switch (mode)
        {
          case S_IFLNK:
            warning(_("%s is a symbolic link; skipping"), path);
            break;
          case S_IFIFO:
            warning(_("%s is a named pipe; skipping"), path);
            break;
          case S_IFBLK:
            warning(_("%s is a block device; skipping"), path);
            break;
          case S_IFCHR:
            warning(_("%s is a character device; skipping"), path);
            break;
          case S_IFDIR:
            warning(_("%s is a directory; skipping"), path);
            break;
          case S_IFSOCK:
            warning(_("%s is a socket; skipping"), path);
            break;
          default:
            error(_("This cannot happen"));
        }
      }
    }
  }
}

/* Reports a cluster to stdout, according to the specified options.
 */
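/* With excess_flag, only the second and subsequent files of the cluster
 * are printed, which is convenient for feeding the list to a deletion
 * command.  Otherwise an optional header is printed first (computing the
 * digest only if the header format actually uses it), followed by every
 * path in the cluster.  Each field ends with the character returned by
 * get_field_terminator(), presumably '\0' or '\n' depending on
 * null_terminate_flag.
 */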
static void report_cluster(const FileList* cluster, unsigned int index)
{
  size_t i;
  File* files = cluster->files;

  if (excess_flag)
  {
    /* Report all but the first file in the cluster */
    for (i = 1;  i < cluster->allocated;  i++)
    {
      printf("%s", files[i].path);
      putchar(get_field_terminator());
    }
  }
  else
  {
    /* Print header and report all files in the cluster */

    if (*header_format != '\0')
    {
      if (header_uses_digest)
        generate_file_digest(files);

      print_cluster_header(header_format,
                           cluster->allocated,
                           index,
                           files->size,
                           files->digest);

      putchar(get_field_terminator());
    }

    for (i = 0;  i < cluster->allocated;  i++)
    {
      printf("%s", files[i].path);
      putchar(get_field_terminator());
    }
  }
}

/* Finds and reports all duplicate clusters in each bucket of collected files.
 */
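/* Only files within the same bucket are ever compared; since the bucket
 * index is derived from the file size, files in different buckets are
 * guaranteed to differ in size and so cannot be duplicates.  Within a
 * bucket, each file not yet reported is compared pairwise against every
 * later unreported file via compare_files(); matching files are gathered
 * into the local 'duplicates' list, and each completed cluster is
 * reported with a running index starting at 1.
 */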
static void process_clusters(void)
{
  size_t i, j, first, second, index = 1;
  FileList duplicates;

  init_file_list(&duplicates);

  for (i = 0;  i < BUCKET_COUNT;  i++)
  {
    File* files = buckets[i].files;

    for (first = 0;  first < buckets[i].allocated;  first++)
    {
      if (files[first].status == INVALID ||
          files[first].status == REPORTED)
      {
        continue;
      }

      for (second = first + 1;  second < buckets[i].allocated;  second++)
      {
        if (files[second].status == INVALID ||
            files[second].status == REPORTED)
        {
          continue;
        }

        if (compare_files(&files[first], &files[second]) == 0)
        {
          if (duplicates.allocated == 0)
          {
            *alloc_file(&duplicates) = files[first];
            files[first].status = REPORTED;
          }

          *alloc_file(&duplicates) = files[second];
          files[second].status = REPORTED;
        }
        else
        {
          if (files[first].status == INVALID)
            break;
        }
      }

      if (duplicates.allocated > 0)
      {
        report_cluster(&duplicates, index);
        empty_file_list(&duplicates);

        index++;
      }
    }

    for (j = 0;  j < buckets[i].allocated;  j++)
      free_file(&files[j]);
  }

  free_file_list(&duplicates);
}