-
Notifications
You must be signed in to change notification settings - Fork 340
/
find_missing_files_in_sequence.py
executable file
·242 lines (212 loc) · 9.99 KB
/
find_missing_files_in_sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env python3
# coding=utf-8
# vim:ts=4:sts=4:sw=4:et
#
# Author: Hari Sekhon
# Date: 2020-07-31 11:03:17 +0100 (Fri, 31 Jul 2020)
#
# https://github.com/HariSekhon/DevOps-Python-tools
#
# License: see accompanying Hari Sekhon LICENSE file
#
# If you're using my code you're welcome to connect with me on LinkedIn
# and optionally send me feedback to help steer this or other code I publish
#
# https://www.linkedin.com/in/HariSekhon
#
"""
Finds missing files by numeric sequence, assuming a uniformly numbered file naming convention across files
Files / directories are given as arguments or via standard input
Directories are recursed and, for each numbered file found, lower-numbered files missing from the sequence before it are reported
Only supply files / directories that should be sharing a contiguously numbered file naming convention in each
single run of this tool
Accounts for zero padding in numbered files
Caveats:
- This is more complicated than you'd first think as there are so many file naming variations that no code could ever
be universally bulletproof and will likely require advanced regex tuning to match your use case and naming convention
- Won't detect missing files higher than the highest numbered file as there is no way to know how many there should be.
If you are looking for missing MP3 files, then you might be able to check the mp3 tag metadata using programs like
'mediainfo' to get the total number of tracks and see if the files go that high
- Returns globs by default instead of explicit missing filenames since suffixes can vary after numbers. If you have a
simple enough use case with a single fixed filename convention such as 'blah_01.txt' then you can find code to print
the missing files more explicitly, but in the general case you cannot account for suffix naming that isn't consistent,
such as chapters of audiobooks eg.
'blah 01 - chapter about X.mp3'
'blah 02 - chapter about Y.mp3'
so in the general case you cannot always infer suffixes, hence why it is left as globs. If you are sure that the
suffixes don't change then you can specify --fixed-suffix and it will infer each file's suffix as the basis for any
numerically missing files in the sequence, but if used where this is not the case, it'll generate a lot of false
positives that the default globbing mode would have handled
- Doesn't currently find entire missing CD / disks in the naming format, but you should be able to see those cases
easily by eye
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import glob
#import logging
import os
import re
import sys
import traceback
# Put the 'pylib' directory that sits next to this script on the import path
# (presumably where the harisekhon library is checked out - verify against the repo layout)
srcdir = os.path.abspath(os.path.dirname(__file__))
libdir = os.path.join(srcdir, 'pylib')
sys.path.append(libdir)

try:
    from harisekhon.utils import log, log_option, validate_regex, isInt, UnknownError
    from harisekhon import CLI
except ImportError:
    # without the library nothing below can work: show the traceback and exit with code 4
    print(traceback.format_exc(), end='')
    sys.exit(4)

__author__ = 'Hari Sekhon'
__version__ = '0.3.2'
# pylint: disable=too-many-instance-attributes
class FindMissingFiles(CLI):
    """
    Command line tool that reports files missing from a numbered sequence.

    For every file supplied (directories are walked recursively) the number
    captured by the regex is extracted from the file's base name. Lower numbers
    are then probed one at a time, going downwards, until an existing file is
    found or number 1 has been probed. Every probed number without a matching
    file is printed, lowest first, either as a glob (default) or as an explicit
    filename when --fixed-suffix is given.

    Option parsing helpers (add_opt, get_opt, args, main) are inherited from
    the CLI base class, which is not defined in this file.
    """

    def __init__(self):
        """Set the default regexes and initialise per-run state."""
        # Python 2.x
        super(FindMissingFiles, self).__init__()
        # Python 3.x
        # super().__init__()
        # files / directories to examine, filled in by process_options()
        self.paths = []
        # Default capture of the sequence number. It is compiled case-insensitively
        # in process_options(), so 'disc' / 'CD' below also match other casings.
        self.regex_default = (
            r'(?<!dis[ck]\s)'  # not preceded by "disc " / "disk "
            r'(?<!CD\s)'       # not preceded by "CD "
            r'(?<!-)'          # not preceded by a hyphen
            r'(?<!-\d)'        # not preceded by a hyphen and one digit
            r'(?<!-\d\d)'      # not preceded by a hyphen and two digits
            r'(?<!0)'          # must not start straight after a zero, ie. part way through a padded number
            r'(?<!\d\.)'       # not preceded by digit + dot
            r'(?<!\.mp)'       # not the digit of an extension such as ".mp3"
            r'(\d+)'           # the captured sequence number
            r'(?![\w,@-])'     # not followed by a word character, comma, @ or hyphen
            r'(?!\.\d)'        # not followed by dot + digit
        )
        # is_excluded() uses search() on the whole path string, so '^' anchors
        # to the start of the path as given / walked, not to the base name
        self.exclude_default = r'^\d+\s'
        self.regex = None
        self.include = None
        self.exclude = None
        self.fixed_suffix = False
        # missing entries collected for the file currently being checked
        self.missing_files = []

    def add_options(self):
        """Register the --regex, --include, --exclude and --fixed-suffix options."""
        super(FindMissingFiles, self).add_options()
        self.add_opt('-r', '--regex', metavar='REGEX', default=self.regex_default,
                     help='Regex capture of the portion of the filename to compare ' + \
                          '- must have capture brackets capturing an integer ' + \
                          '(default: "{}" )'\
                          .format(self.regex_default))
        self.add_opt('-i', '--include', metavar='REGEX',
                     help=r"Include only paths that match the given case-insensitive regex (eg. '\.mp3$')")
        self.add_opt('-e', '--exclude', metavar='REGEX', default=self.exclude_default,
                     help='Exclude paths that match the given case-insensitive regex (default: "{}" )'\
                          .format(self.exclude_default))
        self.add_opt('-s', '--fixed-suffix', action='store_true',
                     help='Assume fixed suffixes and infer explicit filenames rather than globs. The reason this ' + \
                          'is not the default is that if this is not the case and there is some variation in ' + \
                          'suffixes, such as with audiobook chapters, then you will hit a lot of false positives ' + \
                          'that would have been caught by globbing')

    def process_options(self):
        """
        Read and validate the options, compile the regexes and collect the paths.

        Paths come from the positional arguments or, when there are none, from
        standard input with one path per line.
        """
        super(FindMissingFiles, self).process_options()
        self.regex = self.get_opt('regex')
        self.include = self.get_opt('include')
        self.exclude = self.get_opt('exclude')
        self.fixed_suffix = self.get_opt('fixed_suffix')
        validate_regex(self.regex)
        # wrap the user regex so that group 1 is the prefix, group 2 the number
        # and group 3 the suffix of the base name
        self.regex = re.compile('(.*?)' + self.regex + '(.*)', re.I)
        if self.include is not None:
            validate_regex(self.include)
            self.include = re.compile(self.include, re.I)
        if self.exclude is not None:
            validate_regex(self.exclude)
            self.exclude = re.compile(self.exclude, re.I)
        if self.args:
            self.paths = self.args
        else:
            # lines read from stdin keep their line terminator, which would make
            # every os.path.exists() check in run() fail - strip it, and skip blank lines
            self.paths = [line.rstrip('\r\n') for line in sys.stdin if line.strip()]
        log_option('paths', self.paths)

    def is_included(self, path):
        """Return True if no include regex is set or the path matches it."""
        if not self.include:
            return True
        if self.include.search(path):
            log.debug("including path: %s", path)
            return True
        return False

    def is_excluded(self, path):
        """Return True if an exclude regex is set and the path matches it."""
        if not self.exclude:
            return False
        if self.exclude.search(path):
            log.debug("excluding path: %s", path)
            return True
        return False

    def run(self):
        """
        Check every supplied path that passes the include / exclude filters.

        Raises:
            UnknownError: if a path that passed the filters does not exist.
        """
        for path in self.paths:
            if self.is_excluded(path):
                continue
            if not self.is_included(path):
                continue
            if not os.path.exists(path):
                raise UnknownError('path not found: {}'.format(path))
            if os.path.isdir(path):
                self.process_directory(directory=path)
            elif os.path.isfile(path):
                self.check_file(filename=path)

    def process_directory(self, directory):
        """
        Check every file below the directory that passes the include / exclude filters.

        os.walk() already descends into every subdirectory, so no explicit
        recursion is done here - calling this method again for each
        subdirectory made nested files get checked, and their missing
        predecessors reported, several times over.
        """
        for root, _dirs, files in os.walk(directory, topdown=True):
            for filename in files:
                file_path = os.path.join(root, filename)
                if not self.is_included(file_path):
                    continue
                if self.is_excluded(file_path):
                    continue
                self.check_file(filename=file_path)

    def check_file(self, filename):
        """
        Print the entries missing from the sequence directly below this file's number.

        Files whose base name does not match the regex are skipped.

        Raises:
            UnknownError: if the regex captured something that is not an integer.
        """
        log.debug('checking file \'%s\'', filename)
        match = self.regex.search(os.path.basename(filename))
        if not match:
            log.debug('no numeric regex match for file, probably not a sequential file' + \
                      ', skipping \'%s\'', filename)
            return
        # will error out here if you've supplied your own regex without capture brackets
        # or if you've got pre-captures - let this bubble to user to fix their regex
        # group 1 is the non-greedy '(.*?)' prepended in process_options(), so it is never None
        file_prefix = os.path.join(os.path.dirname(filename), match.group(1))
        file_number = match.group(2)
        file_suffix = match.group(3)
        if not isInt(file_number):
            raise UnknownError('regex captured non-integer for filename: {}'.format(filename))
        if file_suffix is None:
            file_suffix = ''
        # width of the number as it appears in the name, so probes use the same zero padding
        padding = len(file_number)
        file_number = int(file_number)
        # probe downwards - the helper returns -1 once an existing predecessor is found
        while file_number > 1:
            file_number = self.determine_missing_file_backfill(file_prefix, file_number, padding, file_suffix)
        if self.missing_files:
            # entries were collected from high to low, print them lowest first
            print('\n'.join(reversed(self.missing_files)))
        self.missing_files = []

    @staticmethod
    def _escape_glob(path):
        """Return path with the glob metacharacters '*', '?' and '[' made literal."""
        # glob.escape() only exists on Python 3.4+, fall back to the same
        # substitution by hand on older interpreters
        escape = getattr(glob, 'escape', None)
        if escape is not None:
            return escape(path)
        return re.sub(r'([*?\[])', r'[\1]', path)

    def determine_missing_file_backfill(self, file_prefix, file_number, padding, file_suffix):
        """
        Probe for the file numbered one below file_number.

        The probed name or glob is appended to self.missing_files when nothing
        matches it.

        Returns:
            the decremented number if the probed file is missing, so that the
            caller keeps going down, or -1 if it exists, to stop the caller.
        """
        file_number -= 1
        # left pad with zeros to the same width as the number in the original name
        padded_number = '{0:0>{1}}'.format(file_number, padding)
        if self.fixed_suffix:
            explicit_last_filename = '{}{}{}'.format(file_prefix, padded_number, file_suffix)
            if os.path.isfile(explicit_last_filename):
                return -1
            self.missing_files.append(explicit_last_filename)
            return file_number
        expected_last_filename_glob = '{}{}*'.format(file_prefix, padded_number)
        # match against an escaped prefix so that literal '[', '*' or '?' in the
        # path are not treated as glob syntax, which caused false positives;
        # the reported glob keeps its original unescaped form
        search_glob = '{}{}*'.format(self._escape_glob(file_prefix), padded_number)
        if glob.glob(search_glob):
            return -1
        self.missing_files.append(expected_last_filename_glob)
        return file_number
# only run the tool when executed as a script, not when imported
if __name__ == '__main__':
    cli = FindMissingFiles()
    cli.main()