1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
|
#!/usr/bin/env python
"""Simple harness that benchmarks different variants of the routines,
caches the results, and emits all of the records at the end.
Results are generated for different values of:
* Source
* Routine
* Length
* Alignment
"""
import argparse
import subprocess
import math
import sys
SINGLE_BUFFER_FUNCTIONS = ['strchr', 'memset', 'strlen', 'memchr']
DUAL_BUFFER_FUNCTIONS = ['memcmp', 'memcpy', 'strcmp', 'strcpy']
FUNCTIONS = list(SINGLE_BUFFER_FUNCTIONS)
FUNCTIONS.extend(DUAL_BUFFER_FUNCTIONS)
HAS = {
'this': 'bounce memchr memcpy memset strchr strcpy strlen',
'bionic-a9': 'memcmp memcpy memset strcmp strcpy strlen',
'bionic-a15': 'memcmp memcpy memset strcmp strcpy strlen',
'bionic-c': FUNCTIONS,
'csl': 'memcpy memset',
'glibc': 'memcpy memset strchr strlen',
'glibc-c': FUNCTIONS,
'newlib': 'memcpy strcmp strcpy strlen',
'newlib-c': FUNCTIONS,
'newlib-xscale': 'memchr memcpy memset strchr strcmp strcpy strlen',
'plain': 'memset memcpy strcmp strcpy',
}
ALIGNMENTS = {
'bounce': ['1'],
}
VARIANTS = sorted(HAS.keys())
NUM_RUNS = 5
DRY_RUN = False
#CLI helpers
def parse_alignments(alignment):
e = Exception("Alignments must be expressed as colon-separated digits e.g. 8:32 16:16")
alignments = alignment.split(':')
if len(alignments) != 2:
raise e
try:
[int(x) for x in alignments]
except:
raise e
return alignment
def run(cache, variant, function, bytes, loops, alignment, run_id, quiet=False):
"""Perform a single run, exercising the cache as appropriate."""
key = ':'.join('%s' % x for x in (variant, function, bytes, loops, alignment, run_id))
if key in cache:
got = cache[key]
else:
xbuild = build + "/try-"
cmd = '%(xbuild)s%(variant)s -t %(function)s -c %(bytes)s -l %(loops)s -a %(alignment)s -r %(run_id)s' % locals()
if(DRY_RUN):
print cmd
return 1
else:
try:
got = subprocess.check_output(cmd.split()).strip()
except OSError, ex:
assert False, 'Error %s while running %s' % (ex, cmd)
parts = got.split(':')
took = float(parts[7])
cache[key] = got
if not quiet:
print got
sys.stdout.flush()
return took
def run_many(cache, variants, bytes, all_functions):
# We want the data to come out in a useful order. So fix an
# alignment and function, and do all sizes for a variant first
bytes = sorted(bytes)
mid = bytes[int(len(bytes)/1.5)]
if not all_functions:
# Use the ordering in 'this' as the default
all_functions = HAS['this'].split()
# Find all other functions
for functions in HAS.values():
for function in functions.split():
if function not in all_functions:
all_functions.append(function)
for function in all_functions:
for alignment in ALIGNMENTS[function]:
for variant in variants:
if function not in HAS[variant].split():
continue
# Run a tracer through and see how long it takes and
# adjust the number of loops based on that. Not great
# for memchr() and similar which are O(n), but it will
# do
f = 50000000
want = 5.0
loops = int(f / math.sqrt(max(1, mid)))
took = run(cache, variant, function, mid, loops, alignment, 0,
quiet=True)
# Keep it reasonable for silly routines like bounce
factor = min(20, max(0.05, want/took))
f = f * factor
# Round f to a few significant figures
scale = 10**int(math.log10(f) - 1)
f = scale*int(f/scale)
for b in sorted(bytes):
# Figure out the number of loops to give a roughly consistent run
if NUM_LOOPS == 0:
loops = int(f / math.sqrt(max(1, b)))
else:
loops = NUM_LOOPS
for run_id in range(0, NUM_RUNS):
run(cache, variant, function, b, loops, alignment,
run_id)
def run_top(cache):
parser = argparse.ArgumentParser()
#Syntax: python ../cortex-strings/scripts/bench.py -f bounce memcpy -v this glibc
parser.add_argument("-v", "--variants", nargs="+", help="library variant to run (run all if not specified)", default = VARIANTS, choices = VARIANTS)
parser.add_argument("-f", "--functions", nargs="+", help="function to run (run all if not specified)", default = FUNCTIONS, choices = FUNCTIONS)
parser.add_argument("-u", "--upper", type=int, help="upper limit to test to (in bytes)", default = 512*1024)
parser.add_argument("-l", "--lower", type=int, help="lowest block size to test (bytes)", default = 0)
parser.add_argument("-s", "--steps", nargs="+", help="steps to test powers of", default = ['1.4', '2.0'])
parser.add_argument("-p", "--prefix", help="path to executables, relative to CWD", default=".")
parser.add_argument("-d", "--dry-run", help="Dry run: just print the invocations that we would use", default=False, action="store_true")
parser.add_argument("-a", "--alignments", nargs="+", type=parse_alignments, help="Alignments, e.g. 2:32 for 2-byte-aligned source to 4-byte-aligned dest. Functions with just a dest use the number before the colon.", default=['1:32', '2:32', '4:32', '8:32', '16:32', '32:32'])
parser.add_argument("-r", "--runs", type=int, help="Number of runs of each test", default=5)
parser.add_argument("-o", "--loops", type=int, help="Lock the number of memcpy loops to perform", default = 0)
args = parser.parse_args()
if(args.lower >= args.upper):
raise Exception("Range starts after it ends!")
global build, DRY_RUN, ALIGNMENTS, NUM_RUNS, NUM_LOOPS
NUM_LOOPS = args.loops
NUM_RUNS = args.runs
build = args.prefix
DRY_RUN = args.dry_run
for function in SINGLE_BUFFER_FUNCTIONS:
ALIGNMENTS[function] = [x.split(':')[0] for x in args.alignments]
for function in DUAL_BUFFER_FUNCTIONS:
ALIGNMENTS[function] = args.alignments
bytes = []
#Test powers of steps
for step in args.steps:
if step[0] == '+':
step = int(step[1:])
bytes.extend(range(args.lower, args.upper + step, step))
else:
step = float(step)
steps = int(round(math.log(args.upper - args.lower, step)))
bytes.extend([args.lower - 1 + int(step**x) for x in range(steps+1)])
run_many(cache, args.variants, bytes, args.functions)
def main():
cachename = 'cache.txt'
cache = {}
try:
with open(cachename) as f:
for line in f:
line = line.strip()
parts = line.split(':')
cache[':'.join(parts[:7])] = line
except:
pass
try:
run_top(cache)
finally:
with open(cachename, 'w') as f:
for line in sorted(cache.values()):
print >> f, line
if __name__ == '__main__':
main()
|