scripts/bench.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205

#!/usr/bin/env python

"""Simple harness that benchmarks different variants of the routines,
caches the results, and emits all of the records at the end.

Results are generated for different values of:
 * Source
 * Routine
 * Length
 * Alignment
"""

import argparse
import subprocess
import math
import sys

SINGLE_BUFFER_FUNCTIONS = ['strchr', 'memset', 'strlen', 'memchr']
DUAL_BUFFER_FUNCTIONS = ['memcmp', 'memcpy', 'strcmp', 'strcpy']

FUNCTIONS = list(SINGLE_BUFFER_FUNCTIONS)
FUNCTIONS.extend(DUAL_BUFFER_FUNCTIONS)

HAS = {
    'this': 'bounce memchr memcpy memset strchr strcpy strlen',
    'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen',
    'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen',
    'bionic-c': FUNCTIONS,
    'csl': 'memcpy memset',
    'glibc': 'memcpy memset strchr strlen',
    'glibc-c': FUNCTIONS,
    'newlib': 'memcpy strcmp strcpy strlen',
    'newlib-c': FUNCTIONS,
    'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen',
    'plain': 'memset memcpy strcmp strcpy',
}

ALIGNMENTS = {
    'bounce': ['1'],
}

VARIANTS = sorted(HAS.keys())

NUM_RUNS = 5

DRY_RUN = False

#CLI helpers
def parse_alignments(alignment):
    e = Exception("Alignments must be expressed as colon-separated digits e.g. 8:32 16:16")
    alignments = alignment.split(':')
    if len(alignments) != 2:
        raise e
    try:
        [int(x) for x in alignments]
    except:
        raise e
    return alignment


def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False):
    """Perform a single run, exercising the cache as appropriate."""
    key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment, run_id))

    if key in cache:
        got = cache[key]
    else:
        xbuild = build + "/try-"
        cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals()

        if(DRY_RUN):
            print cmd
            return 1
        else:
            try:
                got = subprocess.check_output(cmd.split()).strip()
            except OSError, ex:
                assert False, 'Error %s while running %s' % (ex, cmd)

    parts = got.split(':')
    took = float(parts[7])

    cache[key] = got

    if not quiet:
        print got
        sys.stdout.flush()

    return took

def run_many(cache, variants, bytes, all_functions):
    # We want the data to come out in a useful order.  So fix an
    # alignment and function, and do all sizes for a variant first
    bytes = sorted(bytes)
    mid = bytes[int(len(bytes)/1.5)]

    if not all_functions:
        # Use the ordering in 'this' as the default
        all_functions = HAS['this'].split()

        # Find all other functions
        for functions in HAS.values():
            for function in functions.split():
                if function not in all_functions:
                    all_functions.append(function)

    for function in all_functions:
        for alignment in ALIGNMENTS[function]:
            for variant in variants:
                if function not in HAS[variant].split():
                    continue

                # Run a tracer through and see how long it takes and
                # adjust the number of loops based on that.  Not great
                # for memchr() and similar which are O(n), but it will
                # do
                f = 50000000
                want = 5.0

                loops = int(f / math.sqrt(max(1, mid)))
                took = run(cache, variant, function, mid, loops, alignment, 0,
                           quiet=True)
                # Keep it reasonable for silly routines like bounce
                factor = min(20, max(0.05, want/took))
                f = f * factor
                
                # Round f to a few significant figures
                scale = 10**int(math.log10(f) - 1)
                f = scale*int(f/scale)

                for b in sorted(bytes):
                    # Figure out the number of loops to give a roughly consistent run
                    if NUM_LOOPS == 0:
                        loops = int(f / math.sqrt(max(1, b)))
                    else:
                        loops = NUM_LOOPS
                    for run_id in range(0, NUM_RUNS):
                        run(cache, variant, function, b, loops, alignment,
                            run_id)

def run_top(cache):
    parser = argparse.ArgumentParser()
    #Syntax: python ../cortex-strings/scripts/bench.py -f bounce memcpy -v this glibc
    parser.add_argument("-v", "--variants", nargs="+", help="library variant to run (run all if not specified)", default = VARIANTS, choices = VARIANTS)
    parser.add_argument("-f", "--functions", nargs="+", help="function to run (run all if not specified)", default = FUNCTIONS, choices = FUNCTIONS)
    parser.add_argument("-u", "--upper", type=int, help="upper limit to test to (in bytes)", default = 512*1024)
    parser.add_argument("-l", "--lower", type=int, help="lowest block size to test (bytes)", default = 0)
    parser.add_argument("-s", "--steps", nargs="+", help="steps to test powers of", default = ['1.4', '2.0'])
    parser.add_argument("-p", "--prefix", help="path to executables, relative to CWD", default=".")
    parser.add_argument("-d", "--dry-run", help="Dry run: just print the invocations that we would use", default=False, action="store_true")
    parser.add_argument("-a", "--alignments", nargs="+", type=parse_alignments, help="Alignments, e.g. 2:32 for 2-byte-aligned source to 4-byte-aligned dest. Functions with just a dest use the number before the colon.", default=['1:32', '2:32', '4:32', '8:32', '16:32', '32:32'])
    parser.add_argument("-r", "--runs", type=int, help="Number of runs of each test", default=5)
    parser.add_argument("-o", "--loops", type=int, help="Lock the number of memcpy loops to perform", default = 0)
    args = parser.parse_args()

    if(args.lower >= args.upper):
      raise Exception("Range starts after it ends!")

    global build, DRY_RUN, ALIGNMENTS, NUM_RUNS, NUM_LOOPS
    NUM_LOOPS = args.loops
    NUM_RUNS = args.runs
    build = args.prefix
    DRY_RUN = args.dry_run
    for function in SINGLE_BUFFER_FUNCTIONS:
        ALIGNMENTS[function] = [x.split(':')[0] for x in args.alignments]
    for function in DUAL_BUFFER_FUNCTIONS:
        ALIGNMENTS[function] = args.alignments

    bytes = []
    
    #Test powers of steps
    for step in args.steps:
        if step[0] == '+':
            step = int(step[1:])
            bytes.extend(range(args.lower, args.upper + step, step))
        else:
            step = float(step)
            steps = int(round(math.log(args.upper - args.lower, step)))
            bytes.extend([args.lower - 1 + int(step**x) for x in range(steps+1)])

    run_many(cache, args.variants, bytes, args.functions)

def main():
    cachename = 'cache.txt'

    cache = {}

    try:
        with open(cachename) as f:
            for line in f:
                line = line.strip()
                parts = line.split(':')
                cache[':'.join(parts[:7])] = line
    except:
        pass

    try:
        run_top(cache)
    finally:
        with open(cachename, 'w') as f:
            for line in sorted(cache.values()):
                print >> f, line

if __name__ == '__main__':
    main()