-
Notifications
You must be signed in to change notification settings - Fork 0
/
dedupe-flac.sql
130 lines (95 loc) · 5.09 KB
/
dedupe-flac.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
-- first import the text file produced from listing files, their path and FLAC md5sum using bash terminal:
-- In a bash terminal:
echo __path:__md5sig > md5sums_target.txt
find -type f -name \*.flac -print0 | xargs -0 -n1 -P4 metaflac --with-filename --show-md5sum >> md5sums.txt
-- add the necessary delimiters to be able to import the file into a sqlite table.
echo __path:__md5sig:__filename:__dirpath > import.csv && sed ':a;N;$!ba;s/\n/:::::::::::::::::::::::::::::::::::::::::::::::::::::::::\n/g' md5sums.txt >> import.csv
echo __path:__md5sig:__filename:__dirpath > import.csv && sed ':a;N;$!ba;s/\n/::\n/g' md5sums.txt >> import.csv
-- now import the text file produced from listing files, their path and FLAC md5sum
-- in SQLite:
DROP TABLE IF EXISTS audio;
CREATE TABLE audio (
__path TEXT,
__md5sig TEXT,
__filename TEXT,
__dirpath TEXT
);
-- start SQL processing
--
--
--
-- derive filename from the full file path
UPDATE audio
SET __filename = [replace](__path, rtrim(__path, [replace](__path, "/", "") ), "");
--
-- derive __dirpath from the full file path
UPDATE audio
SET __dirpath = substr(__path, 1, length(__path) - length([replace](__path, rtrim(__path, [replace](__path, "/", "") ), "") ) );
--
-- create table in which to store concatenated __md5sig for all __dirnames
DROP TABLE IF EXISTS __dirpath_content_concat__md5sig;
CREATE TABLE __dirpath_content_concat__md5sig (
__dirpath TEXT,
concat__md5sig TEXT
);
--
-- populate table with __dirpath and concatenated __md5sig of all files associated with __dirpath (note order by __md5sig to ensure concatenated __md5sig is consistently generated irrespective of physical record sequence).
INSERT INTO __dirpath_content_concat__md5sig (
__dirpath,
concat__md5sig
)
SELECT __dirpath,
group_concat(__md5sig, " | ")
FROM (
SELECT __dirpath,
__md5sig
FROM audio
ORDER BY __dirpath,
__md5sig
)
GROUP BY __dirpath;
--
-- create table in which to store all __dirnames with identical FLAC contents (i.e. the __md5sig of each FLAC in folder is concatenated and compared)
DROP TABLE IF EXISTS __dirpaths_with_same_content;
CREATE TABLE __dirpaths_with_same_content (
__dirpath TEXT,
concat__md5sig TEXT
);
--
--now write the duplicate records into a separate table listing all __dirname's that have identical FLAC contents
INSERT INTO __dirpaths_with_same_content (
__dirpath,
concat__md5sig
)
SELECT __dirpath,
concat__md5sig
FROM __dirpath_content_concat__md5sig
WHERE concat__md5sig IN (
SELECT concat__md5sig
FROM __dirpath_content_concat__md5sig
GROUP BY concat__md5sig
HAVING count( * ) > 1
)
ORDER BY concat__md5sig,
__dirpath;
--
-- create table for listing directories in which FLAC files should be deleted as they're duplicates
DROP TABLE IF EXISTS __dirpaths_with_FLACs_to_kill;
CREATE TABLE __dirpaths_with_FLACs_to_kill (
__dirpath TEXT,
concat__md5sig TEXT
);
--
-- populate table listing directories in which FLAC files should be deleted as they're duplicates
INSERT INTO __dirpaths_with_FLACs_to_kill (
__dirpath,
concat__md5sig
)
SELECT __dirpath,
concat__md5sig
FROM __dirpaths_with_same_content
WHERE rowid NOT IN (
SELECT min(rowid)
FROM __dirpaths_with_same_content
GROUP BY concat__md5sig
);