matching = require '../lib/matching'
scoring = require '../lib/scoring'
fs = require 'fs'
byline = require 'byline'
sprintf = require('sprintf-js').sprintf
check_usage = () ->
  usage = '''
  Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF,
  in descending order of frequency. That file can be found by googling around for:

  "xato 10-million-combos.txt"

  Passwords that both:

  -- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
  -- have a higher rank than the corresponding match guess number

  are excluded from the final password set, since zxcvbn would score them lower through
  other means anyhow. In practice this rules out dates and years most often and makes room
  for more useful data.

  To use, first run from the zxcvbn base dir:

  npm run build

  then change into the data-scripts directory and run:

  coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt
  '''
  valid = process.argv.length == 5
  valid = valid and process.argv[0] == 'coffee' and process.argv[2] in ['--nodejs', '-n']
  valid = valid and __dirname.split('/').slice(-1)[0] == 'data-scripts'
  unless valid
    console.log usage
    process.exit(0)

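# for the invocation shown in the usage text, the checks above assume process.argv looks like
# ['coffee', '/path/to/count_xato.coffee', '--nodejs', <xato_file>, <output_file>]:
# script run under the coffee binary, with the --nodejs/-n flag passed through as argv[2].
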
# after all passwords are counted, discard pws with counts <= CUTOFF
CUTOFF = 10
# to save memory, after every batch of size BATCH_SIZE, go through counts and delete
# long tail of entries with only one count.
BATCH_SIZE = 1000000
counts = {} # maps pw -> count
skipped_lines = 0 # skipped lines in xato file -- lines w/o two tokens
line_count = 0 # current number of lines processed

normalize = (token) ->
  token.toLowerCase()
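# normalization is just lowercasing, so different capitalizations of the same password are
# counted together -- consistent with zxcvbn's dictionary matching, which lowercases
# candidates before lookup.
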
should_include = (password, xato_rank) ->
  for i in [0...password.length]
    if password.charCodeAt(i) > 127
      # xato mostly contains ascii-only passwords, so in practice
      # this will only skip one or two top passwords over the cutoff.
      # if that weren't the case, or if this were run on a different data source,
      # consider a unidecode-like library instead, as in count_wikipedia / count_wiktionary.
      console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
      return false
  matchers = [
    matching.spatial_match
    matching.repeat_match
    matching.sequence_match
    matching.regex_match
    matching.date_match
  ]
  matches = []
  for matcher in matchers
    matches.push.apply matches, matcher.call(matching, password)
  matches = matches.filter (match) ->
    # only keep matches that span the full password
    match.i == 0 and match.j == password.length - 1
  for match in matches
    if scoring.estimate_guesses(match, password) < xato_rank
      # filter out this entry: non-dictionary matching will assign
      # a lower guess estimate.
      return false
  return true

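# illustrative example (not taken from the data): if a pure year like '1998' appeared at
# xato rank 1500, the regex/date matchers would typically estimate far fewer than 1500
# guesses for it, so should_include would return false and the entry would be dropped.
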
prune = (counts) ->
  for pw, count of counts
    if count == 1
      delete counts[pw]

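# pruning count-1 entries at batch boundaries can slightly undercount passwords whose
# occurrences straddle batches, but since only counts above CUTOFF are kept anyway, the
# effect on the final list should be negligible relative to the memory saved.
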
main = (xato_filename, output_filename) ->
  stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')
  stream.on 'readable', ->
    while null != (line = stream.read())
      line_count += 1
      if line_count % BATCH_SIZE == 0
        console.log 'counting tokens:', line_count
        prune counts
      tokens = line.trim().split /\s+/
      unless tokens.length == 2
        skipped_lines += 1
        continue
      [username, password] = tokens[..1]
      password = normalize password
      if password of counts
        counts[password] += 1
      else
        counts[password] = 1
  stream.on 'end', ->
    console.log 'skipped lines:', skipped_lines
    pairs = []
    console.log 'copying to tuples'
    for pw, count of counts
      if count > CUTOFF
        pairs.push [pw, count]
      delete counts[pw] # save memory to avoid v8 1GB limit
    console.log 'sorting'
    pairs.sort (p1, p2) ->
      # sort by count. higher counts go first.
      p2[1] - p1[1]
    console.log 'filtering'
    pairs = pairs.filter (pair, i) ->
      rank = i + 1
      [pw, count] = pair
      should_include pw, rank
    output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
    for pair in pairs
      [pw, count] = pair
      output_stream.write sprintf("%-15s %d\n", pw, count)
    output_stream.end()

check_usage()
main process.argv[3], process.argv[4]