From a25531dee2ed7a4bfd6d104c9219603c1edea1d5 Mon Sep 17 00:00:00 2001 From: Robert Lemke Date: Tue, 17 Jan 2023 09:50:12 +0100 Subject: [PATCH] Introduce gcs:clean CLI command The new command gcs:clean compares objects found in the storage bucket with resources registered in the database and offers to delete obsolete objects from the Google Cloud Storage bucket. See ./flow help gcs:clean for details. #43 --- Classes/Command/GcsCommandController.php | 226 ++++++++++++++++++++++- README.md | 3 +- 2 files changed, 227 insertions(+), 2 deletions(-) diff --git a/Classes/Command/GcsCommandController.php b/Classes/Command/GcsCommandController.php index 21f5191..5d676eb 100644 --- a/Classes/Command/GcsCommandController.php +++ b/Classes/Command/GcsCommandController.php @@ -13,7 +13,11 @@ * source code. */ +use Doctrine\DBAL\Connection; +use Doctrine\DBAL\Exception; +use Doctrine\DBAL\Types\Types; use Doctrine\ORM\EntityManagerInterface; +use Flownative\Google\CloudStorage\GcsStorage; use Flownative\Google\CloudStorage\GcsTarget; use Flownative\Google\CloudStorage\StorageFactory; use Google\Cloud\Core\Exception\NotFoundException; @@ -22,6 +26,7 @@ use Neos\Flow\Cli\CommandController; use Neos\Flow\ResourceManagement\ResourceManager; use Neos\Flow\ResourceManagement\Storage\StorageObject; +use Symfony\Component\Console\Formatter\OutputFormatterStyle; /** * Google Cloud Storage command controller @@ -30,6 +35,8 @@ */ final class GcsCommandController extends CommandController { + private const TEMPORARY_TABLE_NAME = 'flownative_google_cloudstorage_temp'; + /** * @Flow\Inject * @var StorageFactory @@ -42,6 +49,11 @@ final class GcsCommandController extends CommandController */ protected $resourceManager; + public function initializeObject(): void + { + $this->output->getOutput()->getFormatter()->setStyle('hint', new OutputFormatterStyle('yellow')); + } + /** * Checks the connection * @@ -204,7 +216,7 @@ public function updateResourceMetadataCommand(string $collection = 
'persistent', $object = $targetBucket->object($targetKeyPrefix . $resourceRecord['sha1']); $object->update(['contentType' => $resourceRecord['mediatype']]); $this->outputLine(' ✅ %s %s ', [$resourceRecord['sha1'], $resourceRecord['filename']]); - } catch (ServiceException | NotFoundException $exception) { + } catch (ServiceException|NotFoundException $exception) { $this->outputLine(' ❌ %s %s', [$resourceRecord['sha1'], $resourceRecord['filename']]); } } @@ -215,4 +227,216 @@ public function updateResourceMetadataCommand(string $collection = 'persistent', } $this->outputLine(); } + + /** + * Clean obsolete objects + * + * This command iterates through all objects / files in the Google Cloud Storage bucket + * which is configured as a storage for the specified Flow resource collection. It then + * checks if a corresponding Persistent Resource exists in Flow. If no such resource exists in + * the database, this command can delete the object from the Google Cloud Storage bucket. + * + * This command will ask for confirmation before deleting anything and can only be used + * interactively. + * + * If the option --export-to-file is specified, this command exports a list of SHA1 hashes + * of those objects which *would* be deleted. No object will be deleted if the --export-to-file + * is specified. + * + * @param string $exportToFile Path and filename of a file to write to. If specified, this command will not delete obsolete objects, but write a list of SHA1 hashes which would be deleted to this file + * @param string $collection Name of the Flow resource collection to consider. If not specified, "persistent" will be used. 
+ * @return void + * @throws + */ + public function cleanCommand(string $exportToFile = '', string $collection = 'persistent'): void + { + $storage = $this->getStorageFromCollection($collection); + $connection = $this->getDatabaseConnection(); + $this->createTemporaryTable($connection); + + $this->outputLine('Preparing to clean up obsolete objects in Google Cloud Storage'); + $this->outputLine('Using storage bucket %s', [$storage->getBucketName()]); + + $storageObjectsResourceHashesCount = $connection->executeQuery('SELECT COUNT(*) FROM flownative_google_cloudstorage_temp')->fetchOne(); + + if ($storageObjectsResourceHashesCount > 0) { + $this->outputLine(); + $this->outputLine('Found analysis data from a previous run'); + if ($this->output->askConfirmation('Proceed with existing data? ', true)) { + $this->outputLine('→ Using results from previous run'); + } else { + $this->outputLine('Removing analysis data from previous run'); + $connection->executeQuery('TRUNCATE TABLE ' . self::TEMPORARY_TABLE_NAME); + $storageObjectsResourceHashesCount = 0; + } + } + + if ($storageObjectsResourceHashesCount === 0) { + $storageObjectsResourceHashesCount = $this->retrieveStorageObjectsResourcesHashes($storage, $connection); + } + + $this->outputLine('The bucket contains %s storage objects', [$storageObjectsResourceHashesCount]); + + $this->outputLine(); + $this->outputLine('Matching object hashes with resources in the database ...', [$storage->getBucketName()]); + + $query = <<executeQuery($query); + $obsoleteObjectsCount = $result->rowCount(); + if ($obsoleteObjectsCount > 0) { + $this->outputLine('Found %s objects in Google Cloud Storage which have no corresponding Persistent Resource object', [$result->rowCount()]); + $row = $result->fetchAssociative(); + $this->outputLine('For example, the object with SHA1 %s is likely obsolete and can be deleted from the bucket', [$row['sha1']]); + $this->outputLine(); + + if ($exportToFile !== '') { + $this->outputLine('Exporting hashes of 
obsolete objects to "%s" ...', [$exportToFile]); + $this->exportObsoleteObjectHashesToFile($exportToFile, $connection, $query); + } else { + if (!$this->output->askConfirmation(sprintf('Proceed with deletion of %s obsolete objects in Google Cloud Storage? ', $result->rowCount()), false)) { + $this->outputLine('Nothing was deleted'); + $this->dropTemporaryTable($connection); + exit; + } + $this->deleteObsoleteObjects($storage, $connection, $query, $obsoleteObjectsCount); + } + } else { + $this->outputLine('Found no obsolete objects in this Google Cloud Storage bucket'); + } + + $this->dropTemporaryTable($connection); + $this->outputLine('Done, memory peak usage was %s MB', [(string)(round(memory_get_peak_usage(true) / 1000000))]); + } + + /** + * @throws Exception + */ + private function retrieveStorageObjectsResourcesHashes(GcsStorage $storage, Connection $connection): int + { + $this->outputLine(); + $this->outputLine('Retrieving list of objects from Google Cloud Storage ...', [$storage->getBucketName()]); + + try { + $storageClient = $this->storageFactory->create(); + } catch (\Exception $e) { + $this->outputLine('%s', [$e->getMessage()]); + exit(1); + } + + $storageBucket = $storageClient->bucket($storage->getBucketName()); + $storageKeyPrefix = $storage->getKeyPrefix(); + + $this->output->progressStart(); + $storageObjectsCount = 0; + + $connection->executeQuery('TRUNCATE TABLE ' . 
self::TEMPORARY_TABLE_NAME); + foreach ($storageBucket->objects(['prefix' => $storageKeyPrefix])->iterateByPage() as $pageNumber => $objects) { + foreach ($objects as $object) { + assert($object instanceof \Google\Cloud\Storage\StorageObject); + $storageObjectsCount++; + $connection->insert(self::TEMPORARY_TABLE_NAME, ['sha1' => $object->name()]); + } + $this->output->progressSet($storageObjectsCount); + } + + $this->output->progressFinish(); + $this->outputLine(); + return $storageObjectsCount; + } + + private function getStorageFromCollection(string $collectionName): GcsStorage + { + $collection = $this->resourceManager->getCollection($collectionName); + if (!$collection) { + $this->outputLine('The collection %s does not exist.', [$collectionName]); + exit(1); + } + + $storage = $collection->getStorage(); + if (!$storage instanceof GcsStorage) { + $this->outputLine('The storage defined in collection %s is not a Google Cloud Storage storage.', [$collectionName]); + exit(1); + } + return $storage; + } + + private function getDatabaseConnection(): Connection + { + if ($this->objectManager->isRegistered(EntityManagerInterface::class)) { + $entityManager = $this->objectManager->get(EntityManagerInterface::class); + } else { + $entityManager = $this->objectManager->get(\Doctrine\Common\Persistence\ObjectManager::class); + } + return $entityManager->getConnection(); + } + + private function createTemporaryTable(Connection $connection): void + { + $schemaManager = $connection->getSchemaManager(); + if ($schemaManager === null) { + $this->outputLine('Failed retrieving the schema manager from the DBAL connection'); + exit(1); + } + if (!$schemaManager->tablesExist(self::TEMPORARY_TABLE_NAME)) { + $schema = $schemaManager->createSchema(); + $table = $schema->createTable(self::TEMPORARY_TABLE_NAME); + $table->addColumn('sha1', Types::STRING, ['length' => strlen(sha1('something'))]); + $schemaManager->createTable($table); + } + } + + private function 
dropTemporaryTable(Connection $connection): void + { + $schemaManager = $connection->getSchemaManager(); + if ($schemaManager === null) { + $this->outputLine('Failed retrieving the schema manager from the DBAL connection'); + exit(1); + } + $schemaManager->dropTable(self::TEMPORARY_TABLE_NAME); + } + + private function exportObsoleteObjectHashesToFile(string $targetPathAndFilename, Connection $connection, string $query): void + { + try { + $exportFile = fopen($targetPathAndFilename, 'wb'); + foreach ($connection->iterateAssociative($query) as $row) { + fwrite($exportFile, $row['sha1'] . PHP_EOL); + } + fclose($exportFile); + } catch (\Throwable $throwable) { + $this->outputLine('%s', [$throwable->getMessage()]); + exit(1); + } + } + + private function deleteObsoleteObjects(GcsStorage $storage, Connection $connection, string $query, int $obsoleteObjectsCount): void + { + try { + $storageClient = $this->storageFactory->create(); + $storageBucket = $storageClient->bucket($storage->getBucketName()); + $storageKeyPrefix = $storage->getKeyPrefix(); + + $this->output->progressStart($obsoleteObjectsCount); + foreach ($connection->iterateAssociative($query) as $row) { + $storageBucket->object($storageKeyPrefix . $row['sha1'])->delete(); + /** @noinspection DisconnectedForeachInstructionInspection */ + $this->output->progressAdvance(); + } + $this->output->progressFinish(); + $this->outputLine(); + } catch (\Throwable $throwable) { + $this->outputLine('%s', [$throwable->getMessage()]); + exit(1); + } + + } } diff --git a/README.md b/README.md index 2b28f63..2c15f21 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ this adaptor also works nicely for all kinds of assets in Neos. 
- store all assets or only a specific collection in a private GCS bucket - publish assets to a private or public GCS bucket - supports GZIP compression for selected media types -- command line interface for basic tasks like connection check or emptying an GCS bucket +- command line interface for tasks like connection check, emptying a GCS + bucket or consistency check with cleanup Using this connector, you can run a Neos website which does not store any asset (images, PDFs etc.) on your local webserver.