raspbmirror.py

#!/usr/bin/env python3

# Copyright 2018 Peter Green
# Released under the MIT/Expat license, see doc/COPYING

import os
import sys
import hashlib
import gzip
import stat
#from sortedcontainers import SortedDict
#from sortedcontainers import SortedList
from collections import deque
from collections import OrderedDict
from datetime import datetime
from email.utils import parsedate_to_datetime
import argparse
import re
from heapq import heappush, heappop
import fcntl

parser = argparse.ArgumentParser(description="mirror raspbian repo.")
parser.add_argument("baseurl", help="base url for source repo (e.g. https://archive.raspbian.org/ )",nargs='?')
parser.add_argument("mdurl", help="base url for mirrordirector or local source mirror (e.g. https://mirrordirector.raspbian.org/ )",nargs='?')
parser.add_argument("hpurl", help="base url for last result hash pool (e.g. http://snapshot.raspbian.org/hashpool )",nargs='?')

parser.add_argument("--internal", help=argparse.SUPPRESS) #base URL for private repo (internal use only)
parser.add_argument("--sourcepool", help="specify a source pool to look for packages in before downloading them (useful if maintaining multiple mirrors)",action='append')
parser.add_argument("--tmpdir", help="specify a temporary directory to avoid storing temporary files in the output tree, must be on the same filesystem as the output tree")

#debug option to set the index file used for the "downloadnew" phase but not the "finalize" phase, used to test error recovery.
parser.add_argument("--debugfif", help=argparse.SUPPRESS)
#debug option to set the source url used to download "dists" files during the "downloadnew" phase, used to test error recovery.
parser.add_argument("--debugfdistsurl", help=argparse.SUPPRESS)

parser.add_argument("--tlwhitelist", help="specify comma-seperated whitelist of top-level directories")

parser.add_argument("--cleanup",help="scan for and remove files not managed by raspbmirror from mirror tree", action="store_true")

parser.add_argument("--debugskippool",help="skip downloading pool data, only download metadata (for debugging)",action="store_true")

parser.add_argument("--distswhitelist", help="specify comman seperated list of distributions")

parser.add_argument("--nolock", help="don't try to lock the target directory", action="store_true")

parser.add_argument("--repair", help="during mirroring, verify that all on-disk files match the expected sha256", action="store_true")

parser.add_argument("--urllib", help="force usage of the builtin urllib module, even if urllib3 is present", action="store_true")

parser.add_argument("--urllib3", help="force usage of the urllib3 module, panics if the dependency is missing", action="store_true")

parser.add_argument("--ipv4", help="force usage of IPv4 addresses. Requires urllib3", action="store_true")

parser.add_argument("--ipv6", help="force usage of IPv6 addresses. Requires urllib3", action="store_true")

args = parser.parse_args()

if not args.nolock:
	lockfd = os.open('.',os.O_RDONLY)
	fcntl.flock(lockfd,fcntl.LOCK_EX | fcntl.LOCK_NB)

if args.urllib and args.urllib3:
	print("error: flags --urllib and --urllib3 are in conflict")
	exit(1)

if args.urllib:
	import urllib.request
	use_urllib3 = False
elif args.urllib3:
	import urllib3
	use_urllib3 = True
else:
	# auto detect urllib3
	try:
		import urllib3
		use_urllib3 = True
	except:
		import urllib.request
		use_urllib3 = False

if args.ipv4 and args.ipv6:
	print("error: flags --ipv4 and --ipv6 are in conflict")
	exit(1)

if use_urllib3:
	# the number of pools should be greater than the number of concurrently used sites.
	# 10 should be safe.
	dlmanager = urllib3.PoolManager(num_pools=10)
	print("info: using urllib3")

	# a fairly hacky way to force the usage of ipv4 or ipv6 addresses
	# https://stackoverflow.com/questions/33046733/force-requests-to-use-ipv4-ipv6
	if args.ipv4:
		import socket
		import requests.packages.urllib3.util.connection as urllib3_cn
		def allowed_gai_family():
			return socket.AF_INET
		urllib3_cn.allowed_gai_family = allowed_gai_family
	elif args.ipv6:
		import socket
		import requests.packages.urllib3.util.connection as urllib3_cn
		def allowed_gai_family():
			return socket.AF_INET6
		urllib3_cn.allowed_gai_family = allowed_gai_family
else:
	print("info: using urllib")
	if args.ipv4:
		print("error: flag --ipv4 requires the urllib3 package")
		exit(1)
	elif args.ipv6:
		print("error: flag --ipv6 requires the urllib3 package")
		exit(1)

def addfilefromdebarchive(filestoverify,filequeue,filename,sha256,size):
	size = int(size)
	sha256andsize = [sha256,size,'M']
	if filename in filestoverify:
		if (sha256andsize[0:2] != filestoverify[filename][0:2]):
			if stage == 'scanexisting':
				print('warning: same file with different hash/size during scanexisting phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
				#find existing sha1/size of file on disk if it exists
				if os.path.isfile(filename):
					f = open(filename,'rb')
					data = f.read()
					f.close()
					sha256hash = hashlib.sha256(data)
					sha256hashed = sha256hash.hexdigest().encode('ascii')
					size = len(data)
				else:
					#otherwise we have no idea
					sha256 = None
					size = None
				filestoverify[filename] = [sha256,size,'M']
			else:
				print('error: same file with different hash/size during downloadnew phase old:'+repr(filestoverify[filename])+' new:'+repr(sha256andsize))
				sys.exit(1)
	else:
		filestoverify[filename] = sha256andsize
		addtofilequeue(filequeue,filename)

def addtofilequeue(filequeue,filename):
	filenamesplit = filename.split(b'/')
	if b'dists' in filenamesplit:
		if filename.endswith(b'.gz'):
			# process gz files with high priority so they can be used as substitutes for their uncompressed counterparts
			heappush(filequeue,(10,filename))
		else:
			heappush(filequeue,(20,filename))
	heappush(filequeue,(30,filename))


#regex used for filename sanity checks
pfnallowed = re.compile(b'[a-z0-9A-Z\-_:\+~\.]+',re.ASCII)
shaallowed = re.compile(b'[a-z0-9]+',re.ASCII)

def ensuresafepath(path):
	pathsplit = path.split(b'/')
	if path[0] == '/':
		print("path must be relative")
		sys.exit(1)
	for component in pathsplit:
		if not pfnallowed.fullmatch(component):
			print("component "+ascii(component)+" in path "+ascii(path)+" contains unexpected characters")
			sys.exit(1)
		elif component[0] == '.':
			print("filenames starting with a dot are not allowed")
			sys.exit(1)
	
def geturl(fileurl):
    if use_urllib3:
        response = dlmanager.request("GET", fileurl.decode('ascii'))
        ts = getts(fileurl, response)
        return (response.data, ts)
    else:
        with urllib.request.urlopen(fileurl.decode('ascii')) as response:
            data = response.read()
        ts = getts(fileurl, response)
        return (data, ts)

def getts(fileurl, response):
	if fileurl[:7] == b'file://':
		ts = os.path.getmtime(fileurl[7:])
	else:
		dt = parsedate_to_datetime(response.getheader('Last-Modified'))
		if dt.tzinfo is None:
			dt = dt.replace(tzinfo=timezone.utc)
		ts = dt.timestamp()
	return ts


def makenewpath(path):
	if args.tmpdir is None:
		return path+b'.new'
	else:
		return os.path.join(args.tmpdir.encode('ascii'),(path+b'.new').replace(b'/',b'~'))

def getfile(path,sha256,size):
	ensuresafepath(path)
	if not shaallowed.fullmatch(sha256):
		print('invalid character in sha256 hash')
		sys.exit(1)
	#hashfn = b'../hashpool/' + sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256
	#if os.path.isfile(hashfn):
	#	if os.path.getsize(hashfn) != size:
	#		print('size mismatch on existing file in hash pool')
	#		sys.exit(1)
	#else:
	#	secondhashfn = None
	#	if args.secondpool is not None:
	#		secondhashfn = os.path.join(args.secondpool.encode('ascii'),sha256[:2] +b'/'+ sha256[:4] +b'/'+ sha256)
	#		#print(secondhashfn)
	#		if not os.path.isfile(secondhashfn):
	#			secondhashfn = None
	#	if secondhashfn is None:
	#	else:
	#		print('copying '+path.decode('ascii')+' with hash '+sha256.decode('ascii')+' from secondary pool')
	#		f = open(secondhashfn,'rb')
	#		data = f.read()
	#		f.close()
	#		ts = os.path.getmtime(secondhashfn)
	#	sha256hash = hashlib.sha256(data)
	#	sha256hashed = sha256hash.hexdigest().encode('ascii')
	#	if (sha256 != sha256hashed):
	#		#print(repr(filesize))
	#		#print(repr(sha256))
	#		#print(repr(sha256hashed))
	#		print('hash mismatch while downloading file '+path.decode('ascii')+' '+sha256.decode('ascii')+' '+sha256hashed.decode('ascii'));
	#		sys.exit(1)
	#	if len(data) != size:
	#		print('size mismatch while downloading file')
	#		sys.exit(1)
	#	hashdir = os.path.dirname(hashfn)
	#	os.makedirs(hashdir,exist_ok=True)
	#	f = open(hashfn,'wb')
	#	f.write(data)
	#	f.close()
	#	            
	#	os.utime(hashfn,(ts,ts))
	if len(os.path.dirname(path)) > 0:
		os.makedirs(os.path.dirname(path),exist_ok=True)
	havenewfile = os.path.isfile(makenewpath(path))
	if havenewfile: # "new" file already exists, lets check the hash
		fn = makenewpath(path)
		sha256hashed, tl = getfilesha256andsize(fn)
		if (sha256 == sha256hashed) and (size == tl):
			print('existing file '+path.decode('ascii')+' matched by hash and size')
			fileupdates.add(path)
			return # no download needed but rename is
	if os.path.isfile(path): # file already exists
		if (size == os.path.getsize(path)): #no point reading the data and calculating a hash if the size does not match
			if (not args.repair) and (path in oldknownfiles) and (not havenewfile):
				#shortcut exit if file is unchanged, we skip this if a "new" file was detected because
				#that means some sort of update was going on to the file and may need to be finished/cleaned up.
				oldsha256,oldsize,oldstatus = oldknownfiles[path]
				if (oldsha256 == sha256) and (oldsize == size) and (oldstatus != 'F'):
					return # no update needed

			sha256hashed, tl = getfilesha256andsize(path)
			if (sha256 == sha256hashed) and (size == tl):
				print('existing file '+path.decode('ascii')+' matched by hash and size')
				if havenewfile:
					#if file is up to date but a "new" file exists and is bad
					#(we wouldn't have got this far if it was good)
					#schedule the "new" file for removal by adding it to "basefiles"
					basefiles.add(makenewpath(path))
				return  # no update needed
	if os.path.isfile(path): # file already exists
		fileupdates.add(path)
		if os.path.isfile(makenewpath(path)):
			os.remove(makenewpath(path))
		outputpath = makenewpath(path)
	else:
		outputpath = path
	pathsplit = path.split(b'/')
	if (pathsplit[1:2] == [b'pool']) and (args.debugskippool):
		print('skipping download of '+path.decode('ascii')+' because --debugskippool was specified')
		return
	if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
		fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
	else:
		fileurl = baseurl + b'/' + path
	data = None
	if args.sourcepool is not None:
		for sourcepool in args.sourcepool:
			#print(repr(args.sourcepool))
			#print(repr(sourcepool))
			sourcepool = sourcepool.encode('ascii')
			if (len(pathsplit) > 1) and (pathsplit[1] == b'pool'):
				spp = os.path.join(sourcepool,b'/'.join(pathsplit[2:]))
				if os.path.isfile(spp)  and (size == os.path.getsize(spp)):
					print('trying file from sourcepool '+spp.decode('ascii'))
					ts = os.path.getmtime(spp)
					[ sha256hashed, size ] = getfilesha256andsize(spp);
					if (sha256 != sha256hashed):
						#print(repr(filesize))
						#print(repr(sha256))
						#print(repr(sha256hashed))
						print('hash mismatch while trying file from sourcepool, ignoring file');
						data = None
						continue
					try:
						os.link(spp,outputpath)
						print('successfully hardlinked file to source pool')
					
					except:
						print('file in souce pool was good but hard linking failed, copying file instead')
						break
					fdownloads.write(outputpath+b'\n')
					fdownloads.flush()
					return
	if data is None:
		if path+b'.gz' in knownfiles:
			if path+b'.gz' in fileupdates:
				gzfile = makenewpath(path+b'.gz')
			else:
				gzfile = path+b'.gz'
			print('uncompressing '+gzfile.decode('ascii')+' with hash '+sha256.decode('ascii')+' to '+outputpath.decode('ascii'))
			f = gzip.open(gzfile)
			data = f.read()
			f.close()
			ts = os.path.getmtime(gzfile)
			if not checkdatahash(data, sha256, 'hash mismatch while uncompressing file ', path, ''):
				sys.exit(1)
			if len(data) != size:
				print('size mismatch while uncompressing file')
				sys.exit(1)

	#use slicing so we don't error if pathsplit only has one item
	if (data is None) and (mdurl is not None) and (pathsplit[1:2] == [b'pool']):

		fileurl = mdurl + b'/' + path
		#fileurl = mdurl + b'/' + b'/'.join(pathsplit[1:])
		data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, ' from mirrordirector',' trying main server instead')
	if data is None:

		if (args.internal is not None) and (pathsplit[0] == b'raspbian'):
			fileurl = args.internal.encode('ascii') +b'/private/' + b'/'.join(pathsplit[1:])
		elif (args.debugfdistsurl is not None) and (stage == 'downloadnew') and (b'dists' in pathsplit):
			fileurl = args.debugfdistsurl.encode('ascii') + b'/' + path
		else:
			fileurl = baseurl + b'/' + path
		data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '','')
	if data is None:
		if (stage == 'downloadnew') and (b'dists' not in pathsplit):
			print('continuing dispite download failure of '+path.decode('ascii')+', may revisit later')
			global dlerrorcount
			dlerrorcount += 1
			knownfiles[path][2] = 'F'
			return
	if (data is None) and (hpurl is not None):
		print('failed to get '+path.decode('ascii')+' from normal sources, trying hash pool')
		ensuresafepath(sha256)
		fileurl = hpurl + b'/' + sha256[0:2] + b'/' + sha256[0:4] + b'/' + sha256
		data, ts = getandcheckfile(fileurl, sha256, size, path, outputpath, '', '')
	if data is None:
		print('failed to get '+path.decode('ascii')+' aborting')
		sys.exit(1)
	if data is not ...: #... is used to indicate that the file has been downloaded directly to disk and we don't
		                # need to write it out here.
		f = open(outputpath,'wb')
		f.write(data)
		f.close()
	os.utime(outputpath,(ts,ts))
	fdownloads.write(outputpath+b'\n')
	fdownloads.flush()


def getfilesha256andsize(fn):
	sha256hash = hashlib.sha256()
	f = open(fn, 'rb')
	l = bs
	tl = 0
	while l == bs:
		data = f.read(bs)
		l = len(data)
		tl += l
		sha256hash.update(data)
	f.close()
	sha256hashed = sha256hash.hexdigest().encode('ascii')
	return sha256hashed, tl


bs = 16 * 1024 * 1024

def getandcheckfile(fileurl, sha256, size, path, outputpath, errorfromstr, errorsuffix):
	f = None
	try:
		sha256hash = hashlib.sha256()
		if path == outputpath:
			writepath = makenewpath(path)
			viamsg = ' via '+writepath.decode('ascii')
		else:
			writepath = outputpath
			viamsg = ''
		print(
			'downloading ' + fileurl.decode('ascii') + ' with hash ' + sha256.decode(
				'ascii') + ' to ' + outputpath.decode(
				'ascii') + viamsg)
		f = open(writepath, 'wb')
		if use_urllib3:
				response = dlmanager.request("GET", fileurl.decode('ascii'), preload_content=False)
				ts = getts(fileurl, response)
				tl = 0
				for data in response.stream(bs):
						tl += len(data)
						f.write(data)
						sha256hash.update(data)
				response.release_conn()
		else:
			with urllib.request.urlopen(fileurl.decode('ascii')) as response:
				l = bs
				tl = 0
				while l == bs:
					data = response.read(bs)
					f.write(data)
					l = len(data)
					tl += l
					sha256hash.update(data)
				ts = getts(fileurl, response)

		data = ... #used as a flag to indicate that the data is written to disk rather than stored in memory
		f.close()
		if not testandreporthash(sha256hash, sha256, 'hash mismatch while downloading file' + errorfromstr + ' ', path,
							 errorsuffix):
			data = None
		elif tl != size:
			print('size mismatch while downloading file' + errorfromstr + '.' + errorsuffix)
			data = None
	except Exception as e:
		print('exception ' + str(e) + ' while downloading file' + errorfromstr + '.' + errorsuffix)
		if f is not None:
			f.close()
		data = None
		ts = None
	if data is not None:
		#success
		if writepath != outputpath:
			os.rename(writepath, outputpath)
	else:
		#failure, cleanup writepath if nessacery
		if os.path.exists(writepath):
			os.remove(writepath)

	return data, ts


def checkdatahash(data, sha256, errorprefix, path, errorsuffix):
	sha256hash = hashlib.sha256(data)
	return testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix)


def testandreporthash(sha256hash, sha256, errorprefix, path, errorsuffix):
	sha256hashed = sha256hash.hexdigest().encode('ascii')
	if (sha256 != sha256hashed):
		# print(repr(filesize))
		# print(repr(sha256))
		# print(repr(sha256hashed))
		print(errorprefix + path.decode('ascii') + ' ' + sha256.decode('ascii') + ' ' + sha256hashed.decode(
			'ascii') + errorsuffix);
		return False
	return True


if (args.mdurl is None) or (args.mdurl.upper() == 'NONE'):
	mdurl = None
else:
	mdurl = args.mdurl.encode('ascii')

if (args.hpurl is None) or (args.hpurl.upper() == 'NONE'):
	hpurl = None
else:
	hpurl = args.hpurl.encode('ascii')

if args.baseurl is None:
	baseurl = b'https://archive.raspbian.org'
	mdurl = b'http://mirrordirector.raspbian.org'
	hpurl = b'http://snapshot.raspbian.org/hashpool'
else:
	baseurl = args.baseurl.encode('ascii')


symlinkupdates = list()
fileupdates = set()

def opengu(filepath):
	#print('in opengu')
	#print('filepath = '+repr(filepath))
	#print('fileupdates = '+repr(fileupdates))
	f = None
	if (filepath in fileupdates):
		print((b'opening '+makenewpath(filepath)+b' for '+filepath).decode('ascii'))
		f = open(makenewpath(filepath),'rb')
	elif (filepath+b'.gz' in fileupdates):
		print((b'opening '+makenewpath(filepath+b'.gz')+b' for '+filepath).decode('ascii'))
		f = gzip.open(makenewpath(filepath+b'.gz'),'rb')
	elif os.path.exists(filepath):
		print((b'opening '+filepath+b' for '+filepath).decode('ascii'))
		f = open(filepath,'rb')
	elif os.path.exists(filepath+b'.gz'):
		print((b'opening '+filepath+b'.gz for '+filepath).decode('ascii'))
		f = gzip.open(filepath+b'.gz','rb')
	return f

oldsymlinks = set()
newsymlinks = set()

fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"ab")

dlerrorcount = 0;

for stage in ("scanexisting","downloadnew","finalize"):
	if stage == "finalize":
		if dlerrorcount == 0:
			print('skipping stage 3 as there were no download failures in stage 2')
			#we can finish now.
			break
		print('stage 3, download final updates')
		
		oldknownfiles = knownfiles
		oldsymlinks |= newsymlinks
		newsymlinks = set()

	if stage == "downloadnew":
		print('stage 2, main download')
		oldknownfiles = knownfiles
		basefiles = set(oldknownfiles.keys())

	if stage == "scanexisting":
		print('stage 1, scan existing')
	else:
		if args.internal is not None:
			fileurl = args.internal.encode('ascii') + b'/snapshotindex.txt'
		else:
			fileurl = baseurl +b'/snapshotindex.txt'

		if (stage == "downloadnew") and (args.debugfif is not None):
			fileurl = args.debugfif.encode('ascii')
		(filedata,ts) = geturl(fileurl) 

		f = open(makenewpath(b'snapshotindex.txt'),'wb')
		if (args.tlwhitelist is None) and (args.distswhitelist is None):
			f.write(filedata)
		else:
			lines = filedata.split(b'\n')
			if lines[-1] == b'':
				del(lines[-1])
			if args.tlwhitelist is not None:
				tlwhitelist = set(args.tlwhitelist.encode('ascii').split(b','))
				linesnew = []
				for line in lines:
					linesplit = line.split(b'/')
					if linesplit[0] in tlwhitelist:
						linesnew.append(line)
				lines = linesnew
			if args.distswhitelist is not None:
				distswhitelist = set(args.distswhitelist.encode('ascii').split(b','))
				founddists = set()
				foundesdists = set()
				linesnew = []
				for line in lines:
					path, sizeandsha = line.split(b' ')
					pathsplit = path.split(b'/')
					#print(pathsplit)
					#print(len(pathsplit))
					if (len(pathsplit) > 2) and (pathsplit[1] == b'dists'):
						if sizeandsha[0:2] == b'->': #symlink
							target = sizeandsha[2:]
							if target in distswhitelist:
								linesnew.append(line)
						elif pathsplit[2] in distswhitelist:
							linesnew.append(line)
							founddists.add((pathsplit[0],pathsplit[2]))
							if (len(pathsplit) > 3) and (pathsplit[3] == b'extrasources'):
								foundesdists.add((pathsplit[0],pathsplit[2]))
					elif (len(pathsplit) > 1) and pathsplit[1] == b'pool':
						pass
					else:
						linesnew.append(line)
					
				lines = linesnew
				if founddists == set():
					print('none of the whitelisted distributions were found in the index file')
					sys.exit(1)
				missingesdists = founddists - foundesdists
				if missingesdists != set():
					for toplevel,distribution in missingesdists:
						print((b'missing extra sources file for '+toplevel+b'/dists/'+distribution).decode('ascii'))
					sys.exit(1)
			for line in lines:
				f.write(line+b'\n')
		f.close()
		os.utime(makenewpath(b'snapshotindex.txt'),(ts,ts))

	knownfiles = OrderedDict()
	filequeue = []

	if stage == "scanexisting":
		if os.path.isfile(b'snapshotindex.txt'):
			f = open(b'snapshotindex.txt','rb')
		else:
			continue
	else:
		f = open(makenewpath(b'snapshotindex.txt'),'rb')
	for line in f:
		line = line.strip()
		filepath, sizeandsha = line.split(b' ')
		if sizeandsha[:2] == b'->':
			symlinktarget = sizeandsha[2:]
			ensuresafepath(filepath)
			ensuresafepath(symlinktarget)
			if len(os.path.dirname(filepath)) > 0:
				os.makedirs(os.path.dirname(filepath),exist_ok=True)
			if stage == "scanexisting":
				oldsymlinks.add(filepath)
			else:
				if os.path.islink(filepath):
					if os.readlink(filepath) != symlinktarget:
						symlinkupdates.append((filepath,symlinktarget))
				else:
					print('creating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
					os.symlink(symlinktarget,filepath)
				newsymlinks.add(filepath)
		else:
			size,sha256 = sizeandsha.split(b':')
			size = int(size)
			knownfiles[filepath] = [sha256,size,'R']
			addtofilequeue(filequeue,filepath)

	f.close()

	extrasources = {}
	while filequeue:
		(priority, filepath) = heappop(filequeue)
		#print('processing '+filepath.decode('ascii'))
		sha256,size,status = knownfiles[filepath]
		if (stage != "scanexisting") and ((filepath+b'.gz' not in knownfiles) or (status == 'R') or os.path.exists(filepath)):
			getfile(filepath,sha256,size)
		pathsplit = filepath.split(b'/')
		#print(pathsplit[-1])
		#if (pathsplit[-1] == b'Packages'):
		#	print(repr(pathsplit))
		if (pathsplit[-1] == b'Release') and (pathsplit[-3] == b'dists'):
			distdir = b'/'.join(pathsplit[:-1])
			f = opengu(filepath)
			if f is None:
				if stage == 'scanexisting':
					print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
					continue
				else:
					print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
					sys.exit(1)
			insha256 = False;
			for line in f:
				#print(repr(line[0]))
				if (line == b'SHA256:\n'):
					insha256 = True
				elif ((line[0] == 32) and insha256):
					linesplit = line.split()
					filename = distdir+b'/'+linesplit[2]
					#if filename in knownfiles:
					#	if files
					#print(filename)
					addfilefromdebarchive(knownfiles,filequeue,filename,linesplit[0],linesplit[1]);
				else:
					insha256 = False
			f.close()
		elif (pathsplit[-1] == b'Packages') and ((pathsplit[-5] == b'dists') or ((pathsplit[-3] == b'debian-installer') and (pathsplit[-6] == b'dists'))):
						if pathsplit[-5] == b'dists':
							toplevel = b'/'.join(pathsplit[:-5])
						else:
							toplevel = b'/'.join(pathsplit[:-6])
						print('found packages file: '+filepath.decode('ascii'))
						pf = opengu(filepath)
						if pf is None:
							if stage == 'scanexisting':
								print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
								continue
							else:
								print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
								sys.exit(1)

						filename = None
						size = None
						sha256 = None
							
						for line in pf:
							linesplit = line.split()
							if (len(linesplit) == 0):
								if (filename != None):
									addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size);
								filename = None
								size = None
								sha256 = None
							elif (linesplit[0] == b'Filename:'):
								filename = toplevel+b'/'+linesplit[1]
							elif (linesplit[0] == b'Size:'):
								size = linesplit[1]
							elif (linesplit[0] == b'SHA256:'):
								sha256 = linesplit[1]
						pf.close()
		elif (pathsplit[-1] == b'Sources') and (pathsplit[-5] == b'dists'):
						print('found sources file: '+filepath.decode('ascii'))
						toplevel = b'/'.join(pathsplit[:-5])
						pf = opengu(filepath)
						if pf is None:
							if stage == 'scanexisting':
								print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
								continue
							else:
								print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
								sys.exit(1)
						filesfound = [];
						directory = None
						insha256p = False;
						for line in pf:
							linesplit = line.split()
							if (len(linesplit) == 0):
								for ls in filesfound:
									#print(repr(ls))
									addfilefromdebarchive(knownfiles,filequeue,toplevel+b'/'+directory+b'/'+ls[2],ls[0],ls[1]);
								filesfound = [];
								directory = None
								insha256p = False
							elif ((line[0] == 32) and insha256p):
								filesfound.append(linesplit)
							elif (linesplit[0] == b'Directory:'):
								insha256p = False
								directory = linesplit[1]
							elif (linesplit[0] == b'Checksums-Sha256:'):
								insha256p = True
							else:
								insha256p = False
						pf.close()
		elif (args.distswhitelist is not None) and (pathsplit[-1] == b'extrasources') and (pathsplit[-3] == b'dists'):
						print('found extrasources file: '+filepath.decode('ascii'))
						esf = opengu(filepath)
						if esf is None:
							if stage == 'scanexisting':
								print('warning: cannot find '+filepath.decode('ascii')+' while scanning existing state')
								continue
							else:
								print('error: cannot find '+filepath.decode('ascii')+' or a gzipped substitute, aborting')
								sys.exit(1)
						for line in esf:
							line = line.strip()
							filename , shaandsize = line.split(b' ')
							size , sha256 = shaandsize.split(b':')
							addfilefromdebarchive(knownfiles,filequeue,filename,sha256,size)
							extrasources[filename] = shaandsize
							#print(line)

fdownloads.close()
fdownloads = open(makenewpath(b'raspbmirrordownloads.txt'),"rb")
for line in fdownloads:
	basefiles.add(line.strip())
fdownloads.close()

def throwerror(error):
	raise error

if args.cleanup:
	towalk = os.walk('.', True, throwerror, False)
	for (dirpath, dirnames, filenames) in towalk:
		for filename in (filenames + dirnames):  # os.walk seems to regard symlinks to directories as directories.
			filepath = os.path.join(dirpath, filename)[2:].encode('ascii')  # [2:] is to strip the ./ prefix
			# print(filepath)
			if os.path.islink(filepath):
				oldsymlinks.add(filepath)
		for filename in filenames:
			filepath = os.path.join(dirpath, filename)[2:].encode('ascii')  # [2:] is to strip the ./ prefix
			if not os.path.islink(filepath) and not filepath.startswith(b'snapshotindex.txt') and not filepath.startswith(b'raspbmirrordownloads.txt'):
				basefiles.add(filepath)

print('stage 4, moves and deletions')

for filepath in fileupdates:
	print((b'renaming '+makenewpath(filepath)+b' to '+filepath).decode('ascii'))
	os.replace(makenewpath(filepath),filepath)

for (filepath,symlinktarget) in symlinkupdates:
	print('updating symlink '+filepath.decode('ascii')+' -> '+symlinktarget.decode('ascii'))
	os.remove(filepath)
	os.symlink(symlinktarget,filepath)


removedfiles = (basefiles | oldsymlinks) - (set(knownfiles.keys()) | newsymlinks)

def isemptydir(dirpath):
	#scandir would be significantly more efficient, but needs python 3.6 or above
	#which is not reasonable to expect at this time.
	#return os.path.isdir(dirpath) and ((next(os.scandir(dirpath), None)) is None)
	return os.path.isdir(dirpath) and (len(os.listdir(dirpath)) == 0)

if args.tmpdir is None:
	tmpdir = None
else:
	tmpdir = args.tmpdir.encode('ascii')
	if tmpdir[-1] != b'/':
		tmpdir += b'/'


for filepath in removedfiles:
	#file may not actually exist, either due to earlier updates gone-wrong
	#or due to the file being a non-realised uncompressed version of
	#a gzipped file.
	if os.path.exists(filepath):
		checkpath = filepath
		#if the path points into the temporary directory we only check the part of it
		#that is relative to the tempory directory.
		if tmpdir is not None and filepath.startswith(tmpdir):
			checkpath = filepath[len(tmpdir):]
		ensuresafepath(checkpath)
		print('removing '+filepath.decode('ascii'))
		os.remove(filepath)
		#clean up empty directories.
		dirpath = os.path.dirname(filepath)
		while (len(dirpath) != 0) and isemptydir(dirpath):
			print('removing empty dir '+dirpath.decode('ascii'))
			os.rmdir(dirpath)
			dirpath = os.path.dirname(dirpath)

f = open(makenewpath(b'snapshotindex.txt'),'ab')
for filename, shaandsize in extrasources.items():
	f.write(filename+b' '+shaandsize+b'\n')
f.close()

os.rename(makenewpath(b'snapshotindex.txt'),b'snapshotindex.txt')
os.remove(makenewpath(b'raspbmirrordownloads.txt'))