diff options
-rwxr-xr-x | fixup-perf-csv.py | 54 |
1 files changed, 11 insertions, 43 deletions
diff --git a/fixup-perf-csv.py b/fixup-perf-csv.py
index 4e9e834..338dd33 100755
--- a/fixup-perf-csv.py
+++ b/fixup-perf-csv.py
@@ -7,55 +7,23 @@ import pandas as pd
 # (a) If dso is not present in perf.csv, it adds a dummy dso column.
 # (b) We don't have dso entry present for binary and libraries, so create a dummy one.
 # (c) Strip whitespace in dso name.
-# (d) Sets status field of symbol entries to 'na' and removes symbol entries
-# for failed benchmarks.
 def main():
-    assert len(sys.argv) == 3
-    perf_df = pd.read_csv(sys.argv[1])
-    status_df = pd.read_csv(sys.argv[2])
-
-    if "dso" not in list(perf_df.columns.values):
-        perf_df["dso"] = "na"
-    perf_df = perf_df.fillna("na")
-    perf_df["dso"] = perf_df["dso"].str.strip()
-
-    merged_df = pd.merge(perf_df, status_df, how="outer", on=["benchmark", "symbol"])
-    # When all benchmarks have failed, perf-tmp.csv (perf_df) is empty.
-    # and this messes up order of columns while merging. Rearrange the columns
-    # to "expected order" with benchmark,symbol appearing first.
+    assert len(sys.argv) == 2
+    df = pd.read_csv(sys.argv[1])
+    if "dso" not in list(df.columns.values):
+        df["dso"] = "na"
+    df = df.fillna("na")
+    df["dso"] = df["dso"].str.strip()
+    # When all benchmarks have failed, perf.csv is empty, and this messes up
+    # order of columns while merging. Rearrange the columns to "expected order"
+    # with benchmark,symbol appearing first.
     # The order of columns shouldn't really be an issue, but we need
     # (or needed at some point) for benchmark to be the first metric, and thus
     # assert for it in merge-metric-csvs.py. This needs to be re-checked after
     # we move away from csvs2table.py.
-    merged_df = merged_df[["benchmark", "symbol", "sample", "dso", "status"]]
-
-    merged_df["sample"] = merged_df["sample"].fillna(-1).astype("int")
-    merged_df["dso"] = merged_df["dso"].fillna("na")
-    merged_df["status"] = merged_df["status"].fillna("na")
-
-    # FIXME: We end up having duplicates in status.csv for some runs.
-    # Remove duplicate entries from merged_df, if they occur in either perf-tmp.csv
-    # or status.csv.
-    merged_df = merged_df.drop_duplicates()
-
-    # Iterate over each row in merged_df, and only keep symbol
-    # entries for those bmks that ran successfully, so later metric and comparison
-    # scripts don't process symbols for failed benchmarks like LLVM-1070.
-    # FIXME: Growing a dataframe inside a loop can end up being terribly inefficient,
-    # revisit later.
-    res_df = pd.DataFrame(columns=merged_df.columns)
-    keep_symbol_row = True
-    for index, row in merged_df.iterrows():
-        if row['status'] != 'na' or keep_symbol_row:
-            res_df.loc[len(res_df)] = row
-        if row['status'] != 'na':
-            keep_symbol_row = True if row['status'] == 'success' \
-                else False
-
-    # Output perf.csv combined with status info and symbol entries removed
-    # for failed benchmarks.
-    res_df.to_csv(sys.stdout, index=False)
+    df = df[["benchmark", "symbol", "sample", "dso"]]
+    df.to_csv(sys.stdout, index=False)
 
 
 if __name__ == "__main__":
     main()