"Fossies" - the Fresh Open Source Software Archive

Member "xdelta3-3.0.11/testing/xdelta3-regtest.py" (24 Mar 2015, 35091 Bytes) of package /linux/misc/xdelta3-3.0.11.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file.

    1 #!/usr/bin/python2.6
    2 # xdelta 3 - delta compression tools and library
    3 # Copyright (C) 2003, 2006, 2007, 2008.  Joshua P. MacDonald
    4 #
    5 #  This program is free software; you can redistribute it and/or modify
    6 #  it under the terms of the GNU General Public License as published by
    7 #  the Free Software Foundation; either version 2 of the License, or
    8 #  (at your option) any later version.
    9 #
   10 #  This program is distributed in the hope that it will be useful,
   11 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
   12 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13 #  GNU General Public License for more details.
   14 #
   15 #  You should have received a copy of the GNU General Public License
   16 #  along with this program; if not, write to the Free Software
   17 #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   18 
   19 # TODO: test 1.5 vs. greedy
   20 
   21 import os, sys, math, re, time, types, array, random
   22 import xdelta3
   23 
   24 #RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS'
   25 #RCSDIR = '/tmp/PRCS_read_copy'
   26 #SAMPLEDIR = "/tmp/WESNOTH_tmp/diff"
   27 
   28 #RCSDIR = 'G:/jmacd/PRCS_copy'
   29 #SAMPLEDIR = "C:/sample_data/Wesnoth/tar"
   30 
   31 RCSDIR = '/Users/jmacd/src/ftp.kernel.org'
   32 SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux'
   33 
   34 #
   35 MIN_SIZE       = 0
   36 
   37 TIME_TOO_SHORT = 0.050
   38 
   39 SKIP_TRIALS    = 2
   40 MIN_TRIALS     = 3
   41 MAX_TRIALS     = 15
   42 
   43 # 10 = fast 1.5 = slow
   44 MIN_STDDEV_PCT = 1.5
   45 
   46 # How many results per round
   47 MAX_RESULTS = 500
   48 TEST_ROUNDS = 10
   49 KEEP_P = (0.5)
   50 
   51 # For RCS testing, what percent to select
   52 FILE_P = (0.50)
   53 
   54 # For run-speed tests
   55 MIN_RUN = 1000 * 1000 * 1
   56 MAX_RUN = 1000 * 1000 * 10
   57 
   58 # Testwide defaults
   59 ALL_ARGS = [
   60     '-q'  # '-vv'
   61     ]
   62 
   63 # The first 7 args go to -C
   64 SOFT_CONFIG_CNT = 7
   65 
   66 CONFIG_ORDER = [ 'large_look',
   67                  'large_step',
   68                  'small_look',
   69                  'small_chain',
   70                  'small_lchain',
   71                  'max_lazy',
   72                  'long_enough',
   73 
   74                  # > SOFT_CONFIG_CNT
   75                  'nocompress',
   76                  'winsize',
   77                  'srcwinsize',
   78                  'sprevsz',
   79                  'iopt',
   80                  'djw',
   81                  'altcode',
   82                  ]
   83 
   84 CONFIG_ARGMAP = {
   85     'winsize'    : '-W',
   86     'srcwinsize' : '-B',
   87     'sprevsz'    : '-P',
   88     'iopt'       : '-I',
   89     'nocompress' : '-N',
   90     'djw'        : '-Sdjw',
   91     'altcode'    : '-T',
   92     }
   93 
   94 def INPUT_SPEC(rand):
   95     return {
   96 
   97     # Time/space costs:
   98 
   99     # -C 1,2,3,4,5,6,7
  100     'large_look' : lambda d: rand.choice([9, 10, 11, 12]),
  101     'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]),
  102     'small_look'   : lambda d: rand.choice([4]),
  103     'small_chain'  : lambda d: rand.choice([1]),
  104     'small_lchain' : lambda d: rand.choice([1]),
  105     'max_lazy'     : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]),
  106 
  107     # Note: long_enough only refers to small matching and has no effect if
  108     # small_chain == 1.
  109     'long_enough'  : lambda d: rand.choice([4]),
  110 
  111     # -N
  112     'nocompress'   : lambda d: rand.choice(['false']),
  113 
  114     # -T
  115     'altcode'      : lambda d: rand.choice(['false']),
  116 
  117     # -S djw
  118     'djw'          : lambda d: rand.choice(['false']),
  119 
  120     # Memory costs:
  121 
  122     # -W
  123     'winsize'      : lambda d: 8 * (1<<20),
  124 
  125     # -B
  126     'srcwinsize'   : lambda d: 64 * (1<<20),
  127 
  128     # -I 0 is unlimited
  129     'iopt'         : lambda d: 0,
  130 
  131     # -P only powers of two
  132     'sprevsz'      : lambda d: rand.choice([x * (1<<16) for x in [4]]),
  133   }
  134 #end
  135 
  136 #
  137 TMPDIR = '/tmp/xd3regtest.%d' % os.getpid()
  138 
  139 RUNFILE = os.path.join(TMPDIR, 'run')
  140 DFILE   = os.path.join(TMPDIR, 'output')
  141 RFILE   = os.path.join(TMPDIR, 'recon')
  142 CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1')
  143 CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2')
  144 
  145 HEAD_STATE = 0
  146 BAR_STATE  = 1
  147 REV_STATE  = 2
  148 DATE_STATE = 3
  149 
  150 #
  151 IGNORE_FILENAME  = re.compile('.*\\.(gif|jpg).*')
  152 
  153 # rcs output
  154 RE_TOTREV  = re.compile('total revisions: (\\d+)')
  155 RE_BAR     = re.compile('----------------------------')
  156 RE_REV     = re.compile('revision (.+)')
  157 RE_DATE    = re.compile('date: ([^;]+);.*')
  158 # xdelta output
  159 RE_HDRSZ   = re.compile('VCDIFF header size: +(\\d+)')
  160 RE_EXTCOMP = re.compile('XDELTA ext comp.*')
  161 
  162 def c2str(c):
  163     return ' '.join(['%s' % x for x in c])
  164 #end
  165 
  166 def SumList(l):
  167     return reduce(lambda x,y: x+y, l)
  168 #end
  169 
  170 # returns (total, mean, stddev, q2 (median),
  171 #          (q3-q1)/2 ("semi-interquartile range"), max-min (spread))
  172 class StatList:
  173     def __init__(self,l,desc):
  174         cnt = len(l)
  175         assert(cnt > 1)
  176         l.sort()
  177         self.cnt    = cnt
  178         self.l      = l
  179         self.total  = SumList(l)
  180         self.mean   = self.total / float(self.cnt)
  181         self.s      = math.sqrt(SumList([(x-self.mean) * 
  182                                          (x - self.mean) for x in l]) / 
  183                                 float(self.cnt-1))
  184         self.q0     = l[0]
  185         self.q1     = l[int(self.cnt/4.0+0.5)]
  186         self.q2     = l[int(self.cnt/2.0+0.5)]
  187         self.q3     = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))]
  188         self.q4     = l[self.cnt-1]
  189         self.siqr   = (self.q3-self.q1)/2.0;
  190         self.spread = (self.q4-self.q0)
  191         if len(l) == 1:
  192             self.str = '%s %s' % (desc, l[0])
  193         else:
  194             self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \
  195                 (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4)
  196     #end
  197 #end
  198 
  199 def RunCommand(args, ok = [0]):
  200     #print 'run command %s' % (' '.join(args))
  201     p = os.spawnvp(os.P_WAIT, args[0], args)
  202     if p not in ok:
  203         raise CommandError(args, 'exited %d' % p)
  204     #end
  205 #end
  206 
  207 def RunCommandIO(args,infn,outfn):
  208     p = os.fork()
  209     if p == 0:
  210         os.dup2(os.open(infn,os.O_RDONLY),0)
  211         os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1)
  212         os.execvp(args[0], args)
  213     else:
  214         s = os.waitpid(p,0)
  215         o = os.WEXITSTATUS(s[1])
  216         if not os.WIFEXITED(s[1]) or o != 0:
  217             raise CommandError(args, 'exited %d' % o)
  218         #end
  219     #end
  220 #end
  221 
  222 class TimedTest:
  223     def __init__(self, target, source, runnable,
  224                  skip_trials = SKIP_TRIALS,
  225                  min_trials = MIN_TRIALS,
  226                  max_trials = MAX_TRIALS,
  227                  min_stddev_pct = MIN_STDDEV_PCT):
  228         self.target = target
  229         self.source = source
  230         self.runnable = runnable
  231 
  232         self.skip_trials = skip_trials
  233         self.min_trials = min(min_trials, max_trials)
  234         self.max_trials = max_trials
  235         self.min_stddev_pct = min_stddev_pct
  236 
  237         self.encode_time = self.DoTest(DFILE,
  238                                        lambda x: x.Encode(self.target, 
  239                                                           self.source, DFILE))
  240         self.encode_size = runnable.EncodeSize(DFILE)
  241 
  242         self.decode_time = self.DoTest(RFILE,
  243                                        lambda x: x.Decode(DFILE, 
  244                                                           self.source, RFILE),
  245                                        )
  246         runnable.Verify(self.target, RFILE)
  247     #end
  248 
  249     def DoTest(self, fname, func):
  250         trials   = 0
  251         measured = []
  252 
  253         while 1:
  254             try:
  255                 os.remove(fname)
  256             except OSError:
  257                 pass
  258 
  259             start_time  = time.time()
  260             start_clock = time.clock()
  261 
  262             func(self.runnable)
  263 
  264             total_clock = (time.clock() - start_clock)
  265             total_time  = (time.time() - start_time)
  266 
  267             elap_time  = max(total_time,  0.0000001)
  268             elap_clock = max(total_clock, 0.0000001)
  269 
  270             trials = trials + 1
  271 
  272             # skip some of the first trials
  273             if trials > self.skip_trials:
  274                 measured.append((elap_clock, elap_time))
  275                 #print 'measurement total: %.1f ms' % (total_time * 1000.0)
  276 
  277             # at least so many
  278             if trials < (self.skip_trials + self.min_trials):
  279                 #print 'continue: need more trials: %d' % trials
  280                 continue
  281 
  282             # compute %variance
  283             done = 0
  284             if self.skip_trials + self.min_trials <= 2:
  285                 measured = measured + measured;
  286                 done = 1
  287             #end
  288 
  289             time_stat = StatList([x[1] for x in measured], 'elap time')
  290             sp = float(time_stat.s) / float(time_stat.mean)
  291 
  292             # what if MAX_TRIALS is exceeded?
  293             too_many = (trials - self.skip_trials) >= self.max_trials
  294             good = (100.0 * sp) < self.min_stddev_pct
  295             if done or too_many or good:
  296                 trials = trials - self.skip_trials
  297                 if not done and not good:
  298                     #print 'too many trials: %d' % trials
  299                     pass
  300                 #clock = StatList([x[0] for x in measured], 'elap clock')
  301                 return time_stat
  302             #end
  303         #end
  304     #end
  305 #end
  306 
  307 def Decimals(start, end):
  308     l = []
  309     step = start
  310     while 1:
  311         r = range(step, step * 10, step)
  312         l = l + r
  313         if step * 10 >= end:
  314             l.append(step * 10)
  315             break
  316         step = step * 10
  317     return l
  318 #end
  319 
  320 # This tests the raw speed of 0-byte inputs
  321 def RunSpeedTest():
  322     for L in Decimals(MIN_RUN, MAX_RUN):
  323         SetFileSize(RUNFILE, L)
  324 
  325         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)]))
  326         ReportSpeed(L, trx, '1MB ')
  327 
  328         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)]))
  329         ReportSpeed(L, trx, '512k')
  330 
  331         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)]))
  332         ReportSpeed(L, trx, '256k')
  333 
  334         trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE))
  335         ReportSpeed(L, trm, 'swig')
  336 
  337         trg = TimedTest(RUNFILE, None, GzipRun1())
  338         ReportSpeed(L,trg,'gzip')
  339     #end
  340 #end
  341 
  342 def SetFileSize(F,L):
  343     fd = os.open(F, os.O_CREAT | os.O_WRONLY)
  344     os.ftruncate(fd,L)
  345     assert os.fstat(fd).st_size == L
  346     os.close(fd)
  347 #end
  348 
  349 def ReportSpeed(L,tr,desc):
  350     print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \
  351           (desc, L,
  352            tr.encode_size,
  353            tr.encode_time.mean * 1000.0,
  354            tr.decode_time.mean * 1000.0)
  355 #end
  356 
  357 class Xdelta3RunClass:
  358     def __init__(self, extra):
  359         self.extra = extra
  360     #end
  361 
  362     def __str__(self):
  363         return ' '.join(self.extra)
  364     #end
  365 
  366     def New(self):
  367         return Xdelta3Runner(self.extra)
  368     #end
  369 #end
  370 
  371 class Xdelta3Runner:
  372     # Use "forkexec" to get special command-line only features like
  373     # external compression support.
  374     def __init__(self, extra, forkexec=False):
  375         self.forkexec = forkexec
  376         self.extra = extra
  377     #end
  378 
  379     def Encode(self, target, source, output):
  380         args = (ALL_ARGS +
  381                 self.extra +
  382                 ['-e'])
  383         if source:
  384             args.append('-s')
  385             args.append(source)
  386         #end
  387         args = args + [target, output]
  388         self.Main(args)
  389     #end
  390 
  391     def Decode(self, input, source, output):
  392         args = (ALL_ARGS +
  393                 ['-d'])
  394         if source:
  395             args.append('-s')
  396             args.append(source)
  397         #end
  398         args = args + [input, output]
  399         self.Main(args)
  400     #end
  401 
  402     def Verify(self, target, recon):
  403         if target[-3:] == ".gz":
  404             RunCommandIO(('gzip', '-dc'), target, CMPTMP1)
  405             RunCommandIO(('gzip', '-dc'), recon, CMPTMP2)
  406             RunCommand(('cmp', CMPTMP1, CMPTMP2))
  407         else:
  408             RunCommand(('cmp', target, recon))
  409     #end
  410 
  411     def EncodeSize(self, output):
  412         return os.stat(output).st_size
  413     #end
  414 
  415     def Main(self, args):
  416         try:
  417             if self.forkexec:
  418                 RunCommand(['../xdelta3'] + args)
  419             else:
  420                 xdelta3.xd3_main_cmdline(args)
  421         except Exception, e:
  422             raise CommandError(args, "xdelta3.main exception: %s" % e)
  423         #end
  424     #end
  425 #end
  426 
  427 class Xdelta3Mod1:
  428     def __init__(self, file):
  429         self.target_data = open(file, 'r').read()
  430     #end
  431 
  432     def Encode(self, ignore1, ignore2, ignore3):
  433         r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10)
  434         if r1 != 0:
  435             raise CommandError('memory', 'encode failed: %s' % r1)
  436         #end
  437         self.encoded = encoded
  438     #end
  439 
  440     def Decode(self, ignore1, ignore2, ignore3):
  441         r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data))
  442         if r2 != 0:
  443             raise CommandError('memory', 'decode failed: %s' % r1)
  444         #end
  445         self.decoded = data1
  446     #end
  447 
  448     def Verify(self, ignore1, ignore2):
  449         if self.target_data != self.decoded:
  450             raise CommandError('memory', 'bad decode')
  451         #end
  452     #end
  453 
  454     def EncodeSize(self, ignore1):
  455         return len(self.encoded)
  456     #end
  457 #end
  458 
  459 class GzipRun1:
  460     def Encode(self, target, source, output):
  461         assert source == None
  462         RunCommandIO(['gzip', '-cf'], target, output)
  463     #end
  464 
  465     def Decode(self, input, source, output):
  466         assert source == None
  467         RunCommandIO(['gzip', '-dcf'], input, output)
  468     #end
  469 
  470     def Verify(self, target, recon):
  471         RunCommand(('cmp', target, recon))
  472     #end
  473 
  474     def EncodeSize(self, output):
  475         return os.stat(output).st_size
  476     #end
  477 #end
  478 
  479 class Xdelta1RunClass:
  480     def __str__(self):
  481         return 'xdelta1'
  482     #end
  483 
  484     def New(self):
  485         return Xdelta1Runner()
  486     #end
  487 #end
  488 
  489 class Xdelta1Runner:
  490     def Encode(self, target, source, output):
  491         assert source != None
  492         args = ['xdelta1', 'delta', '-q', source, target, output]
  493         RunCommand(args, [0, 1])
  494     #end
  495 
  496     def Decode(self, input, source, output):
  497         assert source != None
  498         args = ['xdelta1', 'patch', '-q', input, source, output]
  499         # Note: for dumb historical reasons, xdelta1 returns 1 or 0
  500         RunCommand(args)
  501     #end
  502 
  503     def Verify(self, target, recon):
  504         RunCommand(('cmp', target, recon))
  505     #end
  506 
  507     def EncodeSize(self, output):
  508         return os.stat(output).st_size
  509     #end
  510 #end
  511 
  512 # exceptions
  513 class SkipRcsException:
  514     def __init__(self,reason):
  515         self.reason = reason
  516     #end
  517 #end
  518 
  519 class NotEnoughVersions:
  520     def __init__(self):
  521         pass
  522     #end
  523 #end
  524 
  525 class CommandError:
  526     def __init__(self,cmd,str):
  527         if type(cmd) is types.TupleType or \
  528            type(cmd) is types.ListType:
  529             cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd)
  530         #end
  531         print 'command was: ',cmd
  532         print 'command failed: ',str
  533         print 'have fun debugging'
  534     #end
  535 #end
  536 
  537 class RcsVersion:
  538     def __init__(self,vstr):
  539         self.vstr = vstr
  540     #end
  541     def __cmp__(self,other):
  542         return cmp(self.date, other.date)
  543     #end
  544     def __str__(self):
  545         return str(self.vstr)
  546     #end
  547 #end
  548 
  549 class RcsFile:
  550 
  551     def __init__(self, fname):
  552         self.fname    = fname
  553         self.versions = []
  554         self.state    = HEAD_STATE
  555     #end
  556 
  557     def SetTotRev(self,s):
  558         self.totrev = int(s)
  559     #end
  560 
  561     def Rev(self,s):
  562         self.rev = RcsVersion(s)
  563         if len(self.versions) >= self.totrev:
  564             raise SkipRcsException('too many versions (in log messages)')
  565         #end
  566         self.versions.append(self.rev)
  567     #end
  568 
  569     def Date(self,s):
  570         self.rev.date = s
  571     #end
  572 
  573     def Match(self, line, state, rx, gp, newstate, f):
  574         if state == self.state:
  575             m = rx.match(line)
  576             if m:
  577                 if f:
  578                     f(m.group(gp))
  579                 #end
  580                 self.state = newstate
  581                 return 1
  582             #end
  583         #end
  584         return None
  585     #end
  586 
  587     def Sum1Rlog(self):
  588         f = os.popen('rlog '+self.fname, "r")
  589         l = f.readline()
  590         while l:
  591             if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev):
  592                 pass
  593             elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None):
  594                 pass
  595             elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev):
  596                 pass
  597             elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date):
  598                 pass
  599             #end
  600             l = f.readline()
  601         #end
  602         c = f.close()
  603         if c != None:
  604             raise c
  605         #end
  606     #end
  607 
  608     def Sum1(self):
  609         st = os.stat(self.fname)
  610         self.rcssize = st.st_size
  611         self.Sum1Rlog()
  612         if self.totrev != len(self.versions):
  613             raise SkipRcsException('wrong version count')
  614         #end
  615         self.versions.sort()
  616     #end
  617 
  618     def Checkout(self,n):
  619         v      = self.versions[n]
  620         out    = open(self.Verf(n), "w")
  621         cmd    = 'co -ko -p%s %s' % (v.vstr, self.fname)
  622         total  = 0
  623         (inf,
  624          stream,
  625          err)  = os.popen3(cmd, "r")
  626         inf.close()
  627         buf    = stream.read()
  628         while buf:
  629             total = total + len(buf)
  630             out.write(buf)
  631             buf = stream.read()
  632         #end
  633         v.vsize = total
  634         estr = ''
  635         buf = err.read()
  636         while buf:
  637             estr = estr + buf
  638             buf = err.read()
  639         #end
  640         if stream.close():
  641             raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr))
  642         #end
  643         out.close()
  644         err.close()
  645     #end
  646 
  647     def Vdate(self,n):
  648         return self.versions[n].date
  649     #end
  650 
  651     def Vstr(self,n):
  652         return self.versions[n].vstr
  653     #end
  654 
  655     def Verf(self,n):
  656         return os.path.join(TMPDIR, 'input.%d' % n)
  657     #end
  658 
  659     def FilePairsByDate(self, runclass):
  660         if self.totrev < 2:
  661             raise NotEnoughVersions()
  662         #end
  663         self.Checkout(0)
  664         ntrials = []
  665         if self.totrev < 2:
  666             return vtrials
  667         #end
  668         for v in range(0,self.totrev-1):
  669             if v > 1:
  670                 os.remove(self.Verf(v-1))
  671             #end
  672             self.Checkout(v+1)
  673             if os.stat(self.Verf(v)).st_size < MIN_SIZE or \
  674                os.stat(self.Verf(v+1)).st_size < MIN_SIZE:
  675                 continue
  676             #end
  677 
  678             result = TimedTest(self.Verf(v+1),
  679                                self.Verf(v),
  680                                runclass.New())
  681 
  682             target_size = os.stat(self.Verf(v+1)).st_size
  683 
  684             ntrials.append(result)
  685         #end
  686 
  687         os.remove(self.Verf(self.totrev-1))
  688         os.remove(self.Verf(self.totrev-2))
  689         return ntrials
  690     #end
  691 
  692     def AppendVersion(self, f, n):
  693         self.Checkout(n)
  694         rf = open(self.Verf(n), "r")
  695         data = rf.read()
  696         f.write(data)
  697         rf.close()
  698         return len(data)
  699     #end
  700 
  701 class RcsFinder:
  702     def __init__(self):
  703         self.subdirs  = []
  704         self.rcsfiles = []
  705         self.others   = []
  706         self.skipped  = []
  707         self.biground = 0
  708     #end
  709 
  710     def Scan1(self,dir):
  711         dents = os.listdir(dir)
  712         subdirs  = []
  713         rcsfiles = []
  714         others   = []
  715         for dent in dents:
  716             full = os.path.join(dir, dent)
  717             if os.path.isdir(full):
  718                 subdirs.append(full)
  719             elif dent[len(dent)-2:] == ",v":
  720                 rcsfiles.append(RcsFile(full))
  721             else:
  722                 others.append(full)
  723             #end
  724         #end
  725         self.subdirs  = self.subdirs  + subdirs
  726         self.rcsfiles = self.rcsfiles + rcsfiles
  727         self.others   = self.others   + others
  728         return subdirs
  729     #end
  730 
  731     def Crawl(self, dir):
  732         subdirs = [dir]
  733         while subdirs:
  734             s1 = self.Scan1(subdirs[0])
  735             subdirs = subdirs[1:] + s1
  736         #end
  737     #end
  738 
  739     def Summarize(self):
  740         good = []
  741         for rf in self.rcsfiles:
  742             try:
  743                 rf.Sum1()
  744                 if rf.totrev < 2:
  745                     raise SkipRcsException('too few versions (< 2)')
  746                 #end
  747             except SkipRcsException, e:
  748                 #print 'skipping file %s: %s' % (rf.fname, e.reason)
  749                 self.skipped.append(rf)
  750             else:
  751                 good.append(rf)
  752             #end
  753         self.rcsfiles = good
  754     #end
  755 
  756     def AllPairsByDate(self, runclass):
  757         results = []
  758         good = []
  759         for rf in self.rcsfiles:
  760             try:
  761                 results = results + rf.FilePairsByDate(runclass)
  762             except SkipRcsException:
  763                 print 'file %s has compressed versions: skipping' % (rf.fname)
  764             except NotEnoughVersions:
  765                 print 'testing %s on %s: not enough versions' % (runclass, rf.fname)
  766             else:
  767                 good.append(rf)
  768             #end
  769         self.rcsfiles = good
  770         self.ReportPairs(runclass, results)
  771         return results
  772     #end
  773 
  774     def ReportPairs(self, name, results):
  775         encode_time = 0
  776         decode_time = 0
  777         encode_size = 0
  778         for r in results:
  779             encode_time += r.encode_time.mean
  780             decode_time += r.decode_time.mean
  781             encode_size += r.encode_size
  782         #end
  783         print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \
  784               (name, encode_time, decode_time, encode_size)
  785     #end
  786 
  787     def MakeBigFiles(self, rand):
  788         f1 = open(TMPDIR + "/big.1", "w")
  789         f2 = open(TMPDIR + "/big.2", "w")
  790         population = []
  791         for file in self.rcsfiles:
  792             if len(file.versions) < 2:
  793                 continue
  794             population.append(file)
  795         #end
  796         f1sz = 0
  797         f2sz = 0
  798         fcount = int(len(population) * FILE_P)
  799         assert fcount > 0
  800         for file in rand.sample(population, fcount):
  801             m = IGNORE_FILENAME.match(file.fname)
  802             if m != None:
  803                 continue
  804             #end
  805             r1, r2 = rand.sample(xrange(0, len(file.versions)), 2)
  806             f1sz += file.AppendVersion(f1, r1)
  807             f2sz += file.AppendVersion(f2, r2)
  808             #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):], 
  809             #file.Vstr(r1), file.Vstr(r2)))
  810         #end
  811         testkey = 'rcs%d' % self.biground
  812         self.biground = self.biground + 1
  813 
  814         print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz)
  815         f1.close()
  816         f2.close()
  817         return (TMPDIR + "/big.1",
  818                 TMPDIR + "/big.2",
  819                 testkey)
  820     #end
  821 
  822     def Generator(self):
  823         return lambda rand: self.MakeBigFiles(rand)
  824     #end
  825 #end
  826 
  827 # find a set of RCS files for testing
  828 def GetTestRcsFiles():
  829     rcsf = RcsFinder()
  830     rcsf.Crawl(RCSDIR)
  831     if len(rcsf.rcsfiles) == 0:
  832         raise CommandError('', 'no RCS files')
  833     #end
  834     rcsf.Summarize()
  835     print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (
  836         len(rcsf.rcsfiles),
  837         len(rcsf.subdirs),
  838         len(rcsf.others),
  839         len(rcsf.skipped))
  840     print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str
  841     print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str
  842     return rcsf
  843 #end
  844 
  845 class SampleDataTest:
  846     def __init__(self, dirs):
  847         dirs_in = dirs
  848         self.pairs = []
  849         while dirs:
  850             d = dirs[0]
  851             dirs = dirs[1:]
  852             l = os.listdir(d)
  853             files = []
  854             for e in l:
  855                 p = os.path.join(d, e)
  856                 if os.path.isdir(p):
  857                     dirs.append(p)
  858                 else:
  859                     files.append(p)
  860                 #end
  861             #end
  862             if len(files) > 1:
  863                 files.sort()
  864                 for x in xrange(len(files)):
  865                     for y in xrange(len(files)):
  866                         self.pairs.append((files[x], files[y],
  867                                            '%s-%s' % (files[x], files[y])))
  868                     #end
  869                 #end
  870             #end
  871         #end
  872         print "Sample data test using %d file pairs in %s" % (
  873             len(self.pairs), dirs_in)
  874     #end
  875 
  876     def Generator(self):
  877         return lambda rand: rand.choice(self.pairs)
  878     #end
  879 #end
  880 
  881 # configs are represented as a list of values,
  882 # program takes a list of strings:
  883 def ConfigToArgs(config):
  884     args = [ '-C',
  885              ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])]
  886     for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)):
  887         key = CONFIG_ARGMAP[CONFIG_ORDER[i]]
  888         val = config[i]
  889         if val == 'true' or val == 'false':
  890             if val == 'true':
  891                 args.append('%s' % key)
  892             #end
  893         else:
  894             args.append('%s=%s' % (key, val))
  895         #end
  896     #end
  897     return args
  898 #end
  899 
  900 #
  901 class RandomTest:
  902     def __init__(self, tnum, tinput, config, syntuple = None):
  903         self.mytinput = tinput[2]
  904         self.myconfig = config
  905         self.tnum = tnum
  906 
  907         if syntuple != None:
  908             self.runtime = syntuple[0]
  909             self.compsize = syntuple[1]
  910             self.decodetime = None
  911         else:
  912             args = ConfigToArgs(config)
  913             result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args))
  914 
  915             self.runtime = result.encode_time.mean
  916             self.compsize = result.encode_size
  917             self.decodetime = result.decode_time.mean
  918         #end
  919 
  920         self.score = None
  921         self.time_pos = None
  922         self.size_pos = None
  923         self.score_pos = None
  924     #end
  925 
  926     def __str__(self):
  927         decodestr = ' %s' % self.decodetime
  928         return 'time %.6f%s size %d%s << %s >>%s' % (
  929             self.time(), ((self.time_pos != None) and 
  930                           (" (%s)" % self.time_pos) or ""),
  931             self.size(), ((self.size_pos != None) and 
  932                           (" (%s)" % self.size_pos) or ""),
  933             c2str(self.config()),
  934             decodestr)
  935     #end
  936 
  937     def time(self):
  938         return self.runtime
  939     #end
  940 
  941     def size(self):
  942         return self.compsize
  943     #end
  944 
  945     def config(self):
  946         return self.myconfig
  947     #end
  948 
  949     def score(self):
  950         return self.score
  951     #end
  952 
  953     def tinput(self):
  954         return self.mytinput
  955     #end
  956 #end
  957 
  958 def PosInAlist(l, e):
  959     for i in range(0, len(l)):
  960         if l[i][1] == e:
  961             return i;
  962         #end
  963     #end
  964     return -1
  965 #end
  966 
  967 # Generates a set of num_results test configurations, given the list of
  968 # retest-configs.
  969 def RandomTestConfigs(rand, input_configs, num_results):
  970 
  971     outputs = input_configs[:]
  972     have_set = dict([(c,c) for c in input_configs])
  973 
  974     # Compute a random configuration
  975     def RandomConfig():
  976         config = []
  977         cmap = {}
  978         for key in CONFIG_ORDER:
  979             val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap)
  980             config.append(val)
  981         #end
  982         return tuple(config)
  983     #end
  984 
  985     while len(outputs) < num_results:
  986         newc = None
  987         for i in xrange(100):
  988             c = RandomConfig()
  989             if have_set.has_key(c):
  990                 continue
  991             #end
  992             have_set[c] = c
  993             newc = c
  994             break
  995         if newc is None:
  996             print 'stopped looking for configs at %d' % len(outputs)
  997             break
  998         #end
  999         outputs.append(c)
 1000     #end
 1001     outputs.sort()
 1002     return outputs
 1003 #end
 1004 
 1005 def RunOptimizationLoop(rand, generator, rounds):
 1006     configs = []
 1007     for rnum in xrange(rounds):
 1008         configs = RandomTestConfigs(rand, configs, MAX_RESULTS)
 1009         tinput = generator(rand)
 1010         tests = []
 1011         for x in xrange(len(configs)):
 1012             t = RandomTest(x, tinput, configs[x])
 1013             print 'Round %d test %d: %s' % (rnum, x, t)
 1014             tests.append(t)
 1015         #end
 1016         results = ScoreTests(tests)
 1017 
 1018         for r in results:
 1019             c = r.config()
 1020             if not test_all_config_results.has_key(c):
 1021                 test_all_config_results[c] = [r]
 1022             else:
 1023                 test_all_config_results[c].append(r)
 1024             #end
 1025         #end
 1026 
 1027         #GraphResults('expt%d' % rnum, results)
 1028         #GraphSummary('sum%d' % rnum, results)
 1029 
 1030         # re-test some fraction
 1031         configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]]
 1032     #end
 1033 #end
 1034 
 1035 # TODO: cleanup
 1036 test_all_config_results = {}
 1037 
 1038 def ScoreTests(results):
 1039     scored = []
 1040     timed = []
 1041     sized = []
 1042 
 1043     t_min = float(min([test.time() for test in results]))
 1044     #t_max = float(max([test.time() for test in results]))
 1045     s_min = float(min([test.size() for test in results]))
 1046     #s_max = float(max([test.size() for test in results]))
 1047 
 1048     for test in results:
 1049 
 1050         # Hyperbolic function. Smaller scores still better
 1051         red = 0.999  # minimum factors for each dimension are 1/1000
 1052         test.score = ((test.size() - s_min * red) *
 1053                       (test.time() - t_min * red))
 1054 
 1055         scored.append((test.score, test))
 1056         timed.append((test.time(), test))
 1057         sized.append((test.size(), test))
 1058     #end
 1059 
 1060     scored.sort()
 1061     timed.sort()
 1062     sized.sort()
 1063 
 1064     best_by_size = []
 1065     best_by_time = []
 1066 
 1067     pos = 0
 1068     for (score, test) in scored:
 1069         pos += 1
 1070         test.score_pos = pos
 1071     #end
 1072 
 1073     scored = [x[1] for x in scored]
 1074 
 1075     for test in scored:
 1076         test.size_pos = PosInAlist(sized, test)
 1077         test.time_pos = PosInAlist(timed, test)
 1078     #end
 1079 
 1080     for test in scored:
 1081         c = test.config()
 1082         s = 0.0
 1083         print 'H-Score: %0.9f %s' % (test.score, test)
 1084     #end
 1085 
 1086     return scored
 1087 #end
 1088 
 1089 def GraphResults(desc, results):
 1090     f = open("data-%s.csv" % desc, "w")
 1091     for r in results:
 1092         f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r))
 1093     #end
 1094     f.close()
 1095     os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc))
 1096 #end
 1097 
 1098 def GraphSummary(desc, results_ignore):
 1099     test_population = 0
 1100     config_ordered = []
 1101 
 1102     # drops duplicate test/config pairs (TODO: don't retest them)
 1103     for config, cresults in test_all_config_results.items():
 1104         input_config_map = {}
 1105         uniq = []
 1106         for test in cresults:
 1107             assert test.config() == config
 1108             test_population += 1
 1109             key = test.tinput()
 1110             if not input_config_map.has_key(key):
 1111                 input_config_map[key] = {}
 1112             #end
 1113             if input_config_map[key].has_key(config):
 1114                 print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test)
 1115                 continue
 1116             #end
 1117             input_config_map[key][config] = test
 1118             uniq.append(test)
 1119         #end
 1120         config_ordered.append(uniq)
 1121     #end
 1122 
 1123     # sort configs descending by number of tests
 1124     config_ordered.sort(lambda x, y: len(y) - len(x))
 1125 
 1126     print 'population %d: %d configs %d results' % \
 1127           (test_population,
 1128            len(config_ordered),
 1129            len(config_ordered[0]))
 1130 
 1131     if config_ordered[0] == 1:
 1132         return
 1133     #end
 1134 
 1135     # a map from test-key to test-list w/ various configs
 1136     input_set = {}
 1137     osize = len(config_ordered)
 1138 
 1139     for i in xrange(len(config_ordered)):
 1140         config = config_ordered[i][0].config()
 1141         config_tests = config_ordered[i]
 1142 
 1143         #print '%s has %d tested inputs' % (config, len(config_tests))
 1144 
 1145         if len(input_set) == 0:
 1146             input_set = dict([(t.tinput(), [t]) for t in config_tests])
 1147             continue
 1148         #end
 1149 
 1150         # a map from test-key to test-list w/ various configs
 1151         update_set = {}
 1152         for r in config_tests:
 1153             t = r.tinput()
 1154             if input_set.has_key(t):
 1155                 update_set[t] = input_set[t] + [r]
 1156             else:
 1157                 #print 'config %s does not have test %s' % (config, t)
 1158                 pass
 1159             #end
 1160         #end
 1161 
 1162         if len(update_set) <= 1:
 1163             break
 1164         #end
 1165 
 1166         input_set = update_set
 1167 
 1168         # continue if there are more w/ the same number of inputs
 1169         if i < (len(config_ordered) - 1) and \
 1170            len(config_ordered[i + 1]) == len(config_tests):
 1171             continue
 1172         #end
 1173 
 1174         # synthesize results for multi-test inputs
 1175         config_num = None
 1176 
 1177         # map of config to sum(various test-keys)
 1178         smap = {}
 1179         for (key, tests) in input_set.items():
 1180             if config_num == None:
 1181                 # config_num should be the same in all elements
 1182                 config_num = len(tests)
 1183                 smap = dict([(r.config(),
 1184                               (r.time(),
 1185                                r.size()))
 1186                              for r in tests])
 1187             else:
 1188                 # compuate the per-config sum of time/size
 1189                 assert config_num == len(tests)
 1190                 smap = dict([(r.config(),
 1191                               (smap[r.config()][0] + r.time(),
 1192                                smap[r.config()][1] + r.size()))
 1193                              for r in tests])
 1194             #end
 1195         #end
 1196 
 1197         if config_num == 1:
 1198             continue
 1199         #end
 1200 
 1201         if len(input_set) == osize:
 1202             break
 1203         #end
 1204 
 1205         summary = '%s-%d' % (desc, len(input_set))
 1206         osize = len(input_set)
 1207 
 1208         print 'generate %s w/ %d configs' % (summary, config_num)
 1209         syn = [RandomTest(0, (None, None, summary), config,
 1210                           syntuple = (smap[config][0], smap[config][1]))
 1211                for config in smap.keys()]
 1212         syn = ScoreTests(syn)
 1213         #print 'smap is %s' % (smap,)
 1214         #print 'syn is %s' % (' and '.join([str(x) for x in syn]))
 1215         #GraphResults(summary, syn)
 1216     #end
 1217 #end
 1218 
 1219 def RunRegressionTest(pairs, rounds):
 1220     for args in [
 1221         [],
 1222         ['-S=djw'],
 1223         ['-B=412907520'],
 1224         ['-B 412907520', ],
 1225 
 1226                  ]:
 1227         print "Args %s" % (args)
 1228         for (file1, file2, testkey) in pairs:
 1229             ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True),
 1230                               skip_trials = 0,
 1231                               min_trials = 1,
 1232                               max_trials = 1)
 1233             print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % (
 1234                 file1, file2,
 1235                 ttest.encode_time.str,
 1236                 ttest.decode_time.str,
 1237                 ttest.encode_size)
 1238     #end
 1239 #end
 1240 
 1241 if __name__ == "__main__":
 1242     try:
 1243         RunCommand(['rm', '-rf', TMPDIR])
 1244         os.mkdir(TMPDIR)
 1245 
 1246         #rcsf = GetTestRcsFiles()
 1247         #generator = rcsf.Generator()
 1248 
 1249         sample = SampleDataTest([SAMPLEDIR])
 1250         generator = sample.Generator()
 1251 
 1252         rand = random.Random(135135135135135)
 1253 
 1254         RunRegressionTest(sample.pairs, TEST_ROUNDS)
 1255 
 1256         #RunSpeedTest()
 1257 
 1258         # the idea below is to add the default configurations and
 1259         # xdelta1 to the optimization loop:
 1260         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6']))
 1261         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9']))
 1262         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw']))
 1263         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw']))
 1264         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T']))
 1265         #x1r = rcsf.AllPairsByDate(Xdelta1RunClass())
 1266 
 1267     except CommandError:
 1268         pass
 1269     else:
 1270         RunCommand(['rm', '-rf', TMPDIR])
 1271         pass
 1272     #end
 1273 #end