We are unlikely to be sending XHTML, or want to be.
[adorno.git] / scripts / identify-dups.pl
blob02da2e5c2e22258d84be247c1a2265f4e22e6888
1 #!/usr/bin/perl -w
3 # This script takes as an input the output of a command like
5 # find /data -type f -exec md5sum {} \;
7 # and generates a script to remove duplicates, optionally creating symlinks.
9 use Digest::MD5;
11 use MP3::Info;
12 use Ogg::Vorbis::Header;
14 use String::ShellQuote;
# Running totals, reported in the summary once every group has been handled.
$totalFiles   = 0;
$totalLinked  = 0;
$totalRemoved = 0;

# Master patterns: a prioritised list of regular expressions.  Within a group
# of identical files, the copy that matches the earliest pattern in this list
# is the one that is kept; every other copy is removed or replaced by a link.
@findMasterPatterns = (
    '^/data/public/Music/Production/Kevin/',
    '^/data/public/Music/Production/Andrew/',
    '^/data/public/Music/Production/Grouped/',
    '^/data/public/Photos/Structured/',
    '^/data/public/Christopher/',
    '^/data/public/Music/nietzche/',
    '^/data/public/Music/Production/Karl/',
    '^/data/public/Photos/Lost-Dates/',
    '^/data/public/Music/Danny_Todd/',
    '^/data/public/Music/Production/Daniel/',
    '^/data/public/Music/Production/Simon/Pop/',
    '^/data/public/movies/TV Episodes/',
    '^/data/public/Music/Production/01 Singles .Graydon./',
    '^/data/public/Music/Production/01 Singles .Peter./',
    '^/data/public/Erica/fromwork/Thesis/Histories of the discipline/',
    '^/data/public/Erica/fromwork/Thesis/Historis of the University in NZ/',
    # Add further master locations here, lowest priority last.
);

# Link patterns: directories in which the original file names may still be
# referenced.  A duplicate whose path matches one of these is replaced by a
# symlink to the master copy instead of simply being deleted.  A pattern of
# '.' matches every path and so always creates links.
@linkPatterns = (
    '^/data/public/Music/Production/01 Singles .Erica./',
    '^/data/public/Christopher/',
);

#print "@findMasterPatterns findMasterPatterns loaded.\n";
#print "@linkPatterns linkPatterns loaded.\n";
# Output: the shell script that will carry out the removals / symlinks.
# Three-argument open keeps the mode separate from the file name.
open(SCRIPT, '>', 'script.sh') || die("can't open script.sh: $!");
print SCRIPT "#!/bin/sh\n";

# Input: md5sum output, sorted so that identical hashes are on adjacent lines.
open(MD5s, '<', 'md5sorted.txt') || die("can't open md5sorted.txt: $!");

# State shared between the main loop and the subs defined further down.
my $numCount  = 0;        # number of files held in the current group
my @tmp       = (0,0);    # not referenced by the code visible in this file
my $lastMD5   = '';       # hash field of the previous input line
my $newMD5    = 'xxx';    # hash field of the current input line
my $fileName  = undef;    # path of the current line, without its leading "/"
my $leftOver  = undef;    # unexpected extra field produced by the split
my @dataStore = ();       # [0] = hash of the group, [1..$numCount] = paths
                          # (was "= undef", which yields the one-element
                          # list (undef) rather than an empty array)
my $firstOne  = 1;        # true until the first input line has been loaded
# Append the file named by the current input line to the current group.
#
# Reads:  $fileName, $leftOver, $newMD5
# Writes: $numCount (incremented), $dataStore[$numCount]
#
# The split in the main loop consumes the leading "/" of the path, so it is
# put back here.  A non-empty $leftOver means the line split into more fields
# than expected; that is reported on STDOUT but the file is still recorded.
sub loadData
{
    $dataStore[ ++$numCount ] = "/$fileName";

    print "UNEXPECTED SPLIT VALUE FOR HASH $newMD5 RESIDUAL $leftOver\n"
        if $leftOver;
}
# Close off the current group and start a new one from the current input line.
#
# Adds the size of the finished group to $totalFiles, empties the store,
# records the current hash ($newMD5) in slot 0 and then loads the current
# file as the first member of the new group via loadData().
sub resetStore
{
    $totalFiles += $numCount;

    # Slot 0 always carries the hash; files occupy slots 1 .. $numCount.
    @dataStore = ($newMD5);
    $numCount  = 0;

    loadData();
}
# Main pass over the sorted md5sum listing.
#
# Lines with the same hash are adjacent, so files are accumulated into
# @dataStore until the hash changes; the finished group is then either
# ignored (one member, i.e. no duplicate) or handed to nextAndLast().
while ( <MD5s> ) {
    chomp;

    # md5sum prints "<hash>  <path>".  Split only at the FIRST " /" (limit 2)
    # so that a path which itself contains " /" is kept whole instead of
    # being truncated.  As before, the hash keeps its trailing space and the
    # path loses its leading "/" (loadData() puts it back).
    my ($md5, $name) = split m! /!, $_, 2;

    # A blank line or one without an absolute path would otherwise be
    # recorded as the file "/" and could end up in an rm command.
    unless (defined $name && length $name) {
        warn "Skipping malformed line $.: $_\n";
        next;
    }

    $lastMD5 = $newMD5;
    ($newMD5, $fileName, $leftOver) = ($md5, $name, undef);

    if ($firstOne) {
        # Very first record: open the first group, hash in slot 0.
        $firstOne     = 0;
        $dataStore[0] = $newMD5;
        loadData();
    }
    elsif ($lastMD5 eq $newMD5) {
        # Same hash as the previous line: just add it to the current group.
        loadData();
    }
    elsif ($numCount == 1) {
        # The hash changed and the finished group had a single member:
        # nothing is duplicated, so only note it in the script.
        print SCRIPT "#IGNORE $dataStore[1]\n";
        resetStore();
    }
    else {
        # The hash changed and the finished group holds duplicates.
        # nextAndLast() ends by calling resetStore(), which starts the next
        # group with the current line.
        nextAndLast();
    }
}

# The input ends without a change of hash, so the last group is still pending.
if ($numCount > 1) {
    nextAndLast();
}
elsif ($numCount == 1) {
    # A trailing single-member group used to be dropped silently and was
    # missing from the file total.
    print SCRIPT "#IGNORE $dataStore[1]\n";
    $totalFiles += $numCount;
}

close(MD5s);
# Buffered write errors only surface when the handle is closed.
close(SCRIPT) || die("can't close script.sh: $!");

print "In this number of files: $totalFiles\n";
print "Linked: $totalLinked\n";
print "Removed: $totalRemoved\n";
if ($totalFiles) {
    printf "Tidied percentage %2.1f\n", ($totalLinked + $totalRemoved) / $totalFiles * 100;
}
# Process one finished group of identical files, held in
# @dataStore[1 .. $numCount], then start the next group.
#
# 1. Choose the master copy: the master patterns are tried in priority order
#    and, for each pattern, the files in input order; the first match wins.
#    Dies if no file in the group matches any master pattern.
# 2. For every other file write a line to SCRIPT: "rm X && ln -s MASTER X"
#    when the path matches a link pattern, otherwise a plain "rm X".
#    $totalLinked / $totalRemoved are updated accordingly.
# 3. Call resetStore() to begin the next group with the current input line.
#
# Declared without the former empty prototype "()": the sub is called before
# it is defined, which under -w produces "called too early to check
# prototype" for every call site.  All working variables are now lexical
# instead of leaking into the package namespace.
sub nextAndLast
{
    # Find the one to keep.
    my $oneToKeep = 0;
    PATTERN:
    for my $pattern (@findMasterPatterns) {
        for my $i (1 .. $numCount) {
            if ($dataStore[$i] =~ $pattern) {
                $oneToKeep = $i;
                last PATTERN;
            }
        }
    }

    if (@dataStore) { #if not, then first run
        unless ($oneToKeep) {
            die "Could not find one to keep for @dataStore"
        }
    }

    # Symlink or delete the others based on directory.
    for my $i (1 .. $numCount) {
        my $file = $dataStore[$i];

        if ($i == $oneToKeep) {
            print SCRIPT "#RETAIN $file\n";
            next;
        }

        # shell_quote() returns one string that is safe to paste into sh.
        my $remove = shell_quote("rm", $file);

        if (grep { $file =~ $_ } @linkPatterns) {
            my $link = shell_quote("ln", "-s", $dataStore[$oneToKeep], $file);
            print SCRIPT "$remove && $link\n";
            $totalLinked++;
        }
        else {
            print SCRIPT "$remove\n";
            $totalRemoved++;
        }
    }

    # And reset the data store etc.
    resetStore();
}