summaryrefslogtreecommitdiff
path: root/bin/dirty-db-cleaner.py
blob: 8ed9c50655ba8f9d4b5ad4a2f75afba797cea384 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python -u
#
# Created by Bjarni R. Einarsson, placed in the public domain. Go wild!
#
import json
import os
import sys

def print_usage(prog):
    """Print the usage/help text for this script to stdout.

    `prog` is the program name shown in the usage line (normally
    sys.argv[0]).
    """
    # Single-argument print(...) behaves the same whether it is parsed as a
    # Python 2 print statement or a Python 3 function call.
    print('')
    print('Usage: %s /path/to/dirty.db' % prog)
    print('')
    print('Note: Will create a file named dirty.db.new in the same folder,')
    print('      please make sure permissions are OK and a file by that')
    print('      name does not exist already. This script works by omitting')
    print('      duplicate lines from the dirty.db file, keeping only the')
    print('      last (latest) instance. No revision data should be lost,')
    print('      but be careful, make backups. If it breaks you get to keep')
    print('      both pieces!')
    print('')


def read_latest(fd):
    """Collect the last line seen for each key in an iterable of lines.

    Each non-blank line must be a JSON object with a 'key' member.

    Returns a tuple (latest, count): `latest` maps each key to the raw text
    of the last line carrying that key (always newline-terminated), and
    `count` is the total number of lines read, blank ones included.

    Raises ValueError if a non-blank line is not valid JSON, and KeyError
    if a record has no 'key' member.
    """
    latest = {}
    count = 0
    for line in fd:
        count += 1
        # Blank lines (e.g. a stray trailing newline) carry no record and
        # would make json.loads() raise, so they are skipped.
        if line.strip():
            # The final line of the file may lack a newline; add one so it
            # cannot be glued onto the following record when written out.
            if not line.endswith('\n'):
                line += '\n'
            latest[json.loads(line)['key']] = line
        if count % 10000 == 0:
            # Progress indicator: one dot per 10000 lines read.
            sys.stderr.write('.')
    return latest, count


def main(argv):
    """Write a de-duplicated copy of the dirty.db file named in argv[1].

    The output goes to '<input>.new'. If the argument is missing, the input
    does not exist, or the output file already exists, the usage text is
    printed and the process exits with status 1.
    """
    dirtydb_input = argv[1] if len(argv) > 1 else None
    dirtydb_output = '%s.new' % dirtydb_input

    # Explicit checks instead of assert: asserts are stripped under
    # "python -O", which would let an existing .new file be overwritten.
    if (dirtydb_input is None
            or not os.path.exists(dirtydb_input)
            or os.path.exists(dirtydb_output)):
        print_usage(argv[0])
        sys.exit(1)

    with open(dirtydb_input, 'r') as fd:
        print('Reading %s' % dirtydb_input)
        dirtydb, lines = read_latest(fd)
    print('')
    print('OK, found %d unique keys in %d lines' % (len(dirtydb), lines))

    with open(dirtydb_output, 'w') as fd:
        fd.writelines(dirtydb.values())

    print('Wrote data to %s. All done!' % dirtydb_output)


if __name__ == '__main__':
    main(sys.argv)