chromium/components/subresource_filter/tools/filter_many.sh

#!/bin/sh

# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This takes a directory of gzip files containing all of the URL requests
# from the HTTP Archive and runs them against the given filter list, showing
# how many times each filter rule matched, in descending order of match count.
# This script is part of the process of building a small filter list, as
# documented in components/subresource_filter/FILTER_LIST_GENERATION.md.

# Example usage, from the directory that contains the gzip files:
# bash filter_many.sh 8 . ~/chromium/src/out/Release/subresource_filter_tool \
#   easylist_indexed > sorted_list
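# The resulting sorted_list has one "<match count> <filter rule>" pair per
# line, highest counts first; for example (the counts and rule text here are
# purely illustrative):
#   52012 ||ads.example.com^
#   4087 /ad_banner.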

# The number of processes to run in parallel. 8 is reasonable for a typical
# machine; 80 is good for a powerful workstation. If 0 is specified, xargs
# runs as many processes at once as it can.
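# Matching the machine's core count is a reasonable starting point; on Linux,
# "$(nproc)" (from GNU coreutils) can be passed as this argument.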
PROCESS_COUNT=$1

# The path to the directory that contains gzip files of resource requests from
# the HTTP Archive.
GZIP_PATH=$2

# The path to the filter_tool binary.
FILTER_TOOL=$3

# The path to the indexed easylist file.
EASYLIST=$4

# Create temporary directory.
TEMP_DIR=$(mktemp -d)

# For each gzip file:
ls $GZIP_PATH/*.gz |

# In parallel, unzip the file and count the number of times each rule matches.
# The results are saved to independent temporary files to ensure that writes
# aren't interleaved mid-rule.
xargs -t -I {} -P $PROCESS_COUNT \
  sh -c "gunzip -c {} | \
         $FILTER_TOOL --ruleset=$EASYLIST match_rules \
         > \$(mktemp $TEMP_DIR/output.XXXXXXXXXX)"
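# With -t, xargs echoes each command on stderr before running it. For a
# hypothetical input file named requests_0001.gz, each worker effectively runs:
#   gunzip -c requests_0001.gz | $FILTER_TOOL --ruleset=$EASYLIST match_rules \
#     > $TEMP_DIR/output.<random suffix>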

# Aggregate the results from those files.
cat $TEMP_DIR/output.* |

# Sort the results by filter rule (field 2) so that all counts for the same
# rule are adjacent for the awk pass below.
sort -k 2 |

# Sum the per-file match counts for each rule.
awk 'NR>1 && rule!=$2 {print count,rule; count=0} {count+=$1} {rule=$2} \
  END {print count,rule}' |
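# For example (rule text is illustrative), the sorted input
#   2 /ad_banner.
#   3 ||ads.example.com^
#   5 ||ads.example.com^
# is reduced by the awk pass above to
#   2 /ad_banner.
#   8 ||ads.example.com^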

# Sort the output in descending order by match count.
sort -n -r

# Delete the temporary directory.
rm -rf $TEMP_DIR