aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMathieu Poirier <mathieu.poirier@linaro.org>2020-12-15 10:59:39 -0700
committerMathieu Poirier <mathieu.poirier@linaro.org>2020-12-16 11:39:08 -0700
commit7ac865480d014bd2a3137196727bbffe4348af6e (patch)
tree994c93114d7dc5cb48ed894f9099f09c7d38b3cf
Initial commit
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
-rw-r--r--README.txt133
-rwxr-xr-xget-mainline-commit48
-rwxr-xr-xget-mainline-feature50
-rwxr-xr-xnegative-set45
-rwxr-xr-xpatchnet-set53
-rwxr-xr-xpatchnet-write19
-rwxr-xr-xpositive-set60
-rwxr-xr-xtesting-set50
8 files changed, 458 insertions, 0 deletions
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..50627fb
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,133 @@
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+
+1 ## Install GNU's parallel utility:
+
+mpoirier@xps15:~/work$ sudo apt-get install parallel
+
+2 ## Install git:
+
+mpoirier@xps15:~/work$ sudo apt-get install git
+
+3 ## Clone the tools and add them to the system PATH:
+
+mpoirier@xps15:~/work$ git clone https://git.linaro.org/people/mathieu.poirier/tooling.git
+mpoirier@xps15:~/work$ cd tooling
+mpoirier@xps15:~/work/tooling$ ls
+get-mainline-commit get-mainline-feature negative-set patchnet-set patchnet-write positive-set README.txt testing-set
+mpoirier@xps15:~/work/tooling$ export PATH=$PATH:/home/mpoirier/work/tooling
+mpoirier@xps15:~/work/tooling$ chmod 755 *
+
+4 ## Download the PatchNetTool model:
+
+mpoirier@xps15:~/work$ mkdir patchnet
+mpoirier@xps15:~/work$ cd patchnet
+mpoirier@xps15:~/work/patchnet$ git init .
+mpoirier@xps15:~/work/patchnet$ git remote add hvdthong/PatchNetTool.git https://github.com/hvdthong/PatchNetTool.git
+
+mpoirier@xps15:~/work/patchnet$ git fetch hvdthong/PatchNetTool.git
+mpoirier@xps15:~/work/patchnet$ git checkout -b PatchNetTool.git/master hvdthong/PatchNetTool.git/master
+
+5 ## Download the PatchNet_updated model:
+
+mpoirier@xps15:~/work$ mkdir patchnet_updated
+mpoirier@xps15:~/work$ cd patchnet_updated
+mpoirier@xps15:~/work/patchnet_updated$ git init .
+mpoirier@xps15:~/work/patchnet_updated$ git remote add Xin-Zhou-smu/PatchNet_updated.git https://github.com/Xin-Zhou-smu/PatchNet_updated.git
+
+mpoirier@xps15:~/work/patchnet_updated$ git fetch Xin-Zhou-smu/PatchNet_updated.git
+mpoirier@xps15:~/work/patchnet_updated$ git checkout -b PatchNet_updated.git/patch-1 Xin-Zhou-smu/PatchNet_updated.git/patch-1
+
+6 ## Create kernel directory and download source:
+
+mpoirier@xps15:~/work$ mkdir kernel
+mpoirier@xps15:~/work$ cd kernel
+mpoirier@xps15:~/work/kernel$ git init .
+mpoirier@xps15:~/work/kernel$ git remote add torvalds/linux.git git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+mpoirier@xps15:~/work/kernel$ git remote
+torvalds/linux.git
+mpoirier@xps15:~/work/kernel$ git fetch torvalds/linux.git
+
+7 ## Checkout the latest version:
+
+mpoirier@xps15:~/work/kernel$ git branch -r
+torvalds/linux.git/master
+mpoirier@xps15:~/work/kernel$ git checkout -b torvalds/linux.git/master-v5.8 v5.8
+Switched to a new branch 'torvalds/linux.git/master-v5.8'
+
+8 ## Get a handle on the stable trees:
+
+IMPORTANT: For the time being the name used for the stable trees must be the
+same as what follows. See script "positive-set" for details.
+
+mpoirier@xps15:~/work/kernel$ git remote add stable/linux-stable.git git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git
+mpoirier@xps15:~/work/kernel$ git fetch stable/linux-stable.git
+mpoirier@xps15:~/work/kernel$ git branch -r | grep linux-stable
+ stable/linux-stable.git/linux-2.6.11.y
+ stable/linux-stable.git/linux-2.6.12.y
+ stable/linux-stable.git/linux-2.6.13.y
+...
+...
+...
+ stable/linux-stable.git/linux-5.5.y
+ stable/linux-stable.git/linux-5.6.y
+ stable/linux-stable.git/linux-5.7.y
+ stable/linux-stable.git/master
+
+9 ## Generate the positive training set by gathering all the patches that have
+ been backported to stable kernels:
+
+mpoirier@xps15:~/work/kernel$ positive-set positive.txt
+(depending on the machine, this can take hours to complete)
+
+mpoirier@xps15:~/work/kernel$ wc -l positive.txt
+44397 positive.txt
+
+10 ## Generate the negative training set using the output, i.e. the file positive.txt,
+ from the above command:
+
+mpoirier@xps15:~/work/kernel$ negative-set positive.txt negative.txt v5.0..v5.8
+(depending on the machine, this can take hours to complete)
+
+mpoirier@xps15:~/work/kernel$ wc -l negative.txt
+82121 negative.txt
+
+11 ## Generate the input file to train PatchNetTool by using the output of the above
+ two commands:
+
+mpoirier@xps15:~/work/kernel$ patchnet-set -p positive.txt -n negative.txt train.txt
+mpoirier@xps15:~/work/kernel$ wc -l train.txt
+126518 train.txt
+
+12 ## Generate the input file to test new patches:
+
+mpoirier@xps15:~/work/kernel$ testing-set testing.txt v5.8..v5.9
+
+13 ## Run the training and test sets through the preprocessing phase:
+
+mpoirier@xps15:~/work$ cp kernel/train.txt patchnet/preprocessing/
+mpoirier@xps15:~/work$ cp kernel/testing.txt patchnet/preprocessing/
+mpoirier@xps15:~/work$ cd patchnet/preprocessing
+mpoirier@xps15:~/work/patchnet/preprocessing$ ./getinfo -j 36 --commit-list train.txt --git /home/linaro/kernel -o train
+(Depending on the machine and the size of the training set, this can take hours)
+mpoirier@xps15:~/work/patchnet/preprocessing$ ./getinfo -j 36 --commit-list testing.txt --git /home/linaro/kernel -o testing
+(Depending on the machine and the size of the testing set, this can take hours)
+
+14 ## Generate .pkl data from .out files:
+
+mpoirier@xps15:~/work/patchnet/preprocessing$ cp train.out testing.out ../../patchnet_updated/
+mpoirier@xps15:~/work/patchnet/preprocessing$ cd ../../patchnet_updated/
+
+mpoirier@xps15:~/work/patchnet_updated$ python text2dict.py -text_path train.out -dict_path train.pkl
+
+mpoirier@xps15:~/work/patchnet_updated$ python text2dict.py -text_path 'testing.out' -dict_path 'test.pkl'
+
+mpoirier@xps15:~/work/patchnet_updated$ python generate_dict.py -text_path1 train.out -text_path2 testing.out -dict_path dict.pkl
+
+15 ## Train the model:
+
+mpoirier@xps15:~/work/patchnet_updated$ python main.py -train -train_data train.pkl -dictionary_data dict.pkl
+
diff --git a/get-mainline-commit b/get-mainline-commit
new file mode 100755
index 0000000..7262e3f
--- /dev/null
+++ b/get-mainline-commit
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# For each *stable* commit in range, output their mainline equivalent
+#
+# USAGE:
+# $ git log --format="%H" v4.14..v4.14.186 | parallel --link get-mainline-commit :::: - ::: $1
+# or
+# cat data.txt | parallel --link get-mainline-commit :::: - ::: output.file
+#
+# INPUT:
+# $1: A git generated SHA1, i.e "bcf876870b95592b52519ed4aafcf9d95999bc9c"
+# $2: Name of the output file
+#
+
+# Needed if input is coming from a file
+sha1=$(echo $1 | awk {'print $1'})
+
+# tagged commits don't carry useful information, simply return
+git describe --exact-match $sha1 &> /dev/null
+if [ $? -eq 0 ]; then
+ exit 0
+fi
+
+while read line ; do
+ # Look for the upsream commit at the top of the changelog
+ if [ "$(echo $line | grep -i "upstream" | wc -l)" -eq 0 ]; then
+ continue
+ fi
+
+ # Look for the SHA1, it is 40 character long
+ for word in $line; do
+ if [ ${#word} -eq 40 ]; then
+ # We have the SHA1, add it to the list only if it is
+ # not in there already
+ if [ "$(grep $word $2 | wc -l)" -eq 0 ]; then
+ git log -1 --format="%H %s" $word >> $2
+ fi
+
+ # No need to go further
+ exit 0
+ fi
+ done
+done < <(git log -1 --format="%b" $sha1)
+
+exit 0
diff --git a/get-mainline-feature b/get-mainline-feature
new file mode 100755
index 0000000..c8ffd72
--- /dev/null
+++ b/get-mainline-feature
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# Find all the commits that:
+# 1) Have a title that doesn't include any of the words in the dictionary
+# 2) Have not been back ported to any of the stable trees
+#
+# USAGE:
+# $ git log --no-merges --no-renames --format="%H" v5.6..v5.7 | parallel --link get-mainline-feature :::: - ::: stable.file ::: output.file
+# $ cat data.txt | parallel --link get-mainline-feature :::: - ::: stable.file ::: output.file
+#
+# INPUT:
+# $1: A git generated SHA1, i.e "bcf876870b95592b52519ed4aafcf9d95999bc9c"
+# $2: Output file from the command "positive-set"
+# $3: Name of the output file
+#
+
+# Case-insensitive substrings; a commit whose title contains any of them is
+# not considered a feature and is skipped.
+dictionary=(MAINTAINER tag defconfig dts selftest script doc
+	bindings kbuild kconfig coccinelle mailmap checkpatch)
+
+# Needed if input is coming from a file: only the first whitespace-separated
+# field of $1 is the SHA1, anything after it is dropped.
+read -r sha1 _ <<< "$1"
+
+# Fetch the commit title; give up with an error if git can't resolve $sha1
+if ! subj=$(git log -1 --pretty="%s" "$sha1"); then
+	exit 1
+fi
+
+# No point in going further if the commit is a tag
+if git describe --exact-match "$sha1" &> /dev/null; then
+	exit 0
+fi
+
+# Skip patches with a title that includes one of the words in the dictionary.
+# The title is passed quoted: left unquoted, a '*' in it would be expanded to
+# the names of the files in the current directory (Documentation, scripts,
+# ...), which match the dictionary and would wrongly discard the commit.
+for token in "${dictionary[@]}"; do
+	if grep -qi -- "$token" <<< "$subj"; then
+		exit 0
+	fi
+done
+
+# See if $sha1 is part of the stable set in $2
+if ! grep -qF -- "$sha1" "$2"; then
+	# This patch looks like a feature, add it to the log
+	echo "$sha1 $subj" >> "$3"
+fi
+
+exit 0
diff --git a/negative-set b/negative-set
new file mode 100755
index 0000000..14c8714
--- /dev/null
+++ b/negative-set
@@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# Finds all the commits within the commit range(s) that have not been back
+# ported to a stable kernel tree.
+#
+# USAGE:
+# $ negative-set <positive.set> <outputfilename.out> <commit range> <commit range> ...
+#
+# INPUT:
+# $1: Output file from the command "positive-set"
+# $2: Name of the output file
+# $3: A valid kernel commit range, i.e v5.0..v5.2 or bcf876870b95..65550098c1c4
+#
+
+# The positive set, the output file and at least one commit range are required
+if [ $# -lt 3 ]; then
+	echo "Script needs at least three arguments" >&2
+	exit 1
+fi
+
+positive_set=$1
+output=$2
+
+# Drop the two file names, what is left in "$@" are the commit ranges
+shift 2
+
+# Make a backup of the output file if it already exists
+if [ -e "$output" ]; then
+	mv -- "$output" "$output.bkp"
+fi
+
+# Start from an empty output file; get-mainline-feature only appends to it,
+# so this guarantees the file exists even if no commit qualifies.
+touch -- "$output"
+
+for range in "$@"; do
+	echo "processing $range"
+	# One get-mainline-feature job per commit in the range
+	git log --no-merges --no-renames --format="%H" "$range" |
+		parallel --link get-mainline-feature :::: - ::: "$positive_set" ::: "$output"
+done
+
+# Exit with the status of the last pipeline
+exit
+
+
diff --git a/patchnet-set b/patchnet-set
new file mode 100755
index 0000000..5a4e7c4
--- /dev/null
+++ b/patchnet-set
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# Takes the content of the positive and negative set and builds an input
+# set that can be processed by PatchNet model. The order of the positive
+# and negative file sets matters.
+#
+# USAGE:
+# $ patchnet-set -p positive.txt -n negative.txt output.txt (training set)
+# $ patchnet-set -n negative.txt output.txt (evaluation set)
+#
+# INPUT:
+# -p: Name of the positive set file generated by "positive-set" (optional)
+# -n: Name of the negative set file generated by "negative-set"
+# $1: Name of output file (first argument after the options)
+#
+
+usage() { echo "usage: patchnet-set [-p <positive.txt>] -n <negative.txt> <output.txt>" 1>&2; exit 1; }
+
+while getopts ":p:n:" opt; do
+ case ${opt} in
+ n) negative=${OPTARG}
+ ;;
+ p) positive=${OPTARG}
+ ;;
+ *)
+ usage
+ ;;
+ esac
+done
+shift $((OPTIND-1))
+
+if [ -z "${negative}" ]; then
+ usage
+fi
+
+# Make a backup of the output file if it already exists
+# After getopts above the last argument, i.e output.txt is in $1
+if [ -e "$1" ]; then
+ mv $1 $1.bkp
+fi
+
+if [ -z "${positive}" ]; then
+ cat $negative | parallel --link patchnet-write :::: - ::: false ::: $1
+else
+ cat $positive | parallel --link patchnet-write :::: - ::: true ::: $1
+ cat $negative | parallel --link patchnet-write :::: - ::: false ::: $1
+fi
+
+exit
+
diff --git a/patchnet-write b/patchnet-write
new file mode 100755
index 0000000..e18a403
--- /dev/null
+++ b/patchnet-write
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# Outputs a SHA1 in a way that it can be digested by the PatchNet model
+#
+# USAGE:
+# cat positive-set.txt | parallel --link patchnet-write :::: - ::: true ::: $1
+# cat negative-set.txt | parallel --link patchnet-write :::: - ::: false ::: $1
+#
+# INPUT:
+# $1: A git generated SHA1, i.e "39030e1351aa security: Add LSM hooks to set*gid syscalls"
+# $2: true if $1 should be a stable patch, false otherwise
+# $3: Name of the output file
+#
+
+sha1=$(echo $1 | awk {'print $1'})
+echo $sha1: $2 >> $3
diff --git a/positive-set b/positive-set
new file mode 100755
index 0000000..13dc57e
--- /dev/null
+++ b/positive-set
@@ -0,0 +1,60 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
+# Goes through all the commits in @stable and outputs their mainline equivalent
+# Since the same commit can be found in more than one tree, the script also
+# checks for redundancy.
+#
+# USAGE:
+# $ positive-set <outputfilename.out>
+#
+# INPUT:
+# $1: Name of the output file
+#
+
+if [ $# -ne 1 ]; then
+	echo "Script only takes one argument" >&2
+	exit 1
+fi
+
+output=$1
+
+# Make a backup of the output file if it already exists
+if [ -e "$output" ]; then
+	mv -- "$output" "$output.bkp"
+fi
+
+# Needed by 'grep' command in get-mainline-commit script
+touch -- "$output"
+
+# Remote tracking branches of the stable trees to harvest.  The names must
+# match the remote name used when the stable tree was added.
+stable=(stable/linux-stable.git/linux-5.4.y
+	stable/linux-stable.git/linux-4.19.y
+	stable/linux-stable.git/linux-4.14.y
+	stable/linux-stable.git/linux-4.9.y
+	stable/linux-stable.git/linux-4.4.y
+	stable/linux-stable.git/linux-3.16.y)
+
+for tree in "${stable[@]}"; do
+	# Get the top commit in each stable tree
+	top_commit=$(git log -1 --format="%H" "$tree")
+
+	# Theoretically the top commit has a tag - skip to the next
+	# tree if a tag can't be found.  The tag name is captured here
+	# and the exit status of 'git describe' itself is tested, rather
+	# than running the tag name as if it were a command.
+	if ! top_tag=$(git describe --exact-match "$top_commit" 2> /dev/null); then
+		echo "Skipping $tree, can't find stable tag"
+		continue
+	fi
+
+	# Get the base and top commits
+	# $top_tag will end up being something like 4.18.23
+	# $base_tag will end up being something like 4.18
+	base_tag=${top_tag%.*}
+
+	echo "Processing $base_tag to $top_tag"
+
+	# For each commit, get the mainline equivalent
+	git log --format="%H" "$base_tag..$top_tag" |
+		parallel --link get-mainline-commit :::: - ::: "$output"
+done
+
+
diff --git a/testing-set b/testing-set
new file mode 100755
index 0000000..0235a52
--- /dev/null
+++ b/testing-set
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# (c) 2020,2022, Mathieu Poirier <mathieu.poirier@linaro.org>
+#
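+# Finds all the commits within the commit range(s) that pass the title filter
+# of get-mainline-feature and writes them in the format expected by PatchNet.
+# No stable set is consulted (an empty one is used), every entry is labelled
+# "false".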
+#
+# USAGE:
+# $ testing-set <outputfilename.out> <commit range> <commit range> ...
+#
+# INPUT:
+# $1: Name of the output file
+# $2: A valid kernel commit range, i.e v5.0..v5.2 or bcf876870b95..65550098c1c4
+#
+
+empty=$(mktemp)
+commits=$(mktemp)
+
+if [ $# -lt 2 ]; then
+ echo "Script needs at least two arguments"
+ exit 1
+fi
+
+# Make a backup of the output file if it already exists
+if [ -e "$1" ]; then
+ mv $1 $1.bkp
+fi
+
+touch $empty $commits
+
+declare -i count=1
+
+for arg in "$@"; do
+ # Skip over the filename argument
+ if [ $count -lt 2 ]; then
+ count+=1
+ continue
+ fi
+ echo processing $arg
+ git log --no-merges --no-renames --format="%H" $arg | parallel --link get-mainline-feature :::: - ::: $empty ::: $commits
+done
+
+cat $commits | parallel --link patchnet-write :::: - ::: false ::: $1
+
+rm $empty $commits
+
+exit
+
+