From f7c4ed69e8223c1d3e662a8219afbff6d0dc8fa7 Mon Sep 17 00:00:00 2001 From: Simon Tzanakis Date: Wed, 4 Sep 2024 09:15:04 +0200 Subject: [PATCH 1/2] MET-6103 Cleanup deprecations on http harvesting --- .../http/AbstractHttpHarvestIterator.java | 180 +++++++++++++++++ .../metis/harvesting/http/HttpHarvester.java | 57 +----- .../harvesting/http/HttpHarvesterImpl.java | 189 +----------------- .../harvesting/http/HttpRecordIterator.java | 41 ---- 4 files changed, 192 insertions(+), 275 deletions(-) create mode 100644 metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java delete mode 100644 metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpRecordIterator.java diff --git a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java new file mode 100644 index 000000000..00c0f7e81 --- /dev/null +++ b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java @@ -0,0 +1,180 @@ +package eu.europeana.metis.harvesting.http; + +import eu.europeana.metis.harvesting.FullRecord; +import eu.europeana.metis.harvesting.HarvesterException; +import eu.europeana.metis.harvesting.HarvestingIterator; +import eu.europeana.metis.harvesting.ReportingIteration; +import eu.europeana.metis.harvesting.ReportingIteration.IterationResult; +import eu.europeana.metis.utils.CompressedFileExtension; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.lang.invoke.MethodHandles; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.time.Instant; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Predicate; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Iterator for harvesting + */ +abstract class AbstractHttpHarvestIterator implements HarvestingIterator { + + private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final Path extractedDirectory; + + protected AbstractHttpHarvestIterator(Path extractedDirectory) { + if (extractedDirectory == null) { + throw new IllegalStateException("Extracted directory is null. This should not happen."); + } + this.extractedDirectory = extractedDirectory; + } + + protected String getExtractedDirectory() { + return extractedDirectory.toString(); + } + + @Override + public void close() { + try { + FileUtils.deleteDirectory(extractedDirectory.toFile()); + } catch (IOException e) { + LOGGER.warn("Could not delete directory.", e); + } + } + + /** + * Iterate through the record paths while applying a filter (potentially skipping some records). + * + * @param action The iteration to perform. It needs to return a result. + * @param filter The filter to apply (only records that return true will be sent to the action). + * @throws HarvesterException In case there was a problem while harvesting. + */ + public void forEachPathFiltered(ReportingIteration action, Predicate filter) + throws HarvesterException { + try { + Files.walkFileTree(extractedDirectory, new FileIteration(action, filter)); + } catch (IOException e) { + throw new HarvesterException("Exception while iterating through the extracted files.", e); + } + } + + /** + * Iterate through the {@link FullRecord} while applying a filter (potentially skipping some records). + * + * @param action The iteration to perform. It needs to return a result. + * @param filter The filter to apply (only records that return true will be sent to the action). + * @throws HarvesterException In case there was a problem while harvesting. + */ + public void forEachFileFiltered(ReportingIteration action, Predicate filter) + throws HarvesterException { + forEachPathFiltered(path -> { + try (InputStream content = Files.newInputStream(path)) { + return action.process(new FullRecordImpl(extractedDirectory.relativize(path).toString(), + new ByteArrayInputStream(IOUtils.toByteArray(content)))); + } catch (RuntimeException e) { + throw new IOException("Could not process path " + path + ".", e); + } + }, filter); + } + + @Override + public void forEachNonDeleted(ReportingIteration action) throws HarvesterException { + forEach(action); + } + + @Override + public Integer countRecords() throws HarvesterException { + // Go by each path only: no need to inspect the full file. + final AtomicInteger counter = new AtomicInteger(0); + forEachPathFiltered(path -> { + counter.incrementAndGet(); + return IterationResult.CONTINUE; + }, path -> true); + return counter.get(); + } + + private static class FileIteration extends SimpleFileVisitor { + + private static final String MAC_TEMP_FILE = ".DS_Store"; + private static final String MAC_TEMP_FOLDER = "__MACOSX"; + + private final ReportingIteration action; + private final Predicate filter; + + public FileIteration(ReportingIteration action, Predicate filter) { + this.action = action; + this.filter = filter; + } + + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (!filter.test(file)) { + return FileVisitResult.CONTINUE; + } + final Path fileName = file.getFileName(); + if (fileName != null && MAC_TEMP_FILE.equals(fileName.toString())) { + return FileVisitResult.CONTINUE; + } + if (CompressedFileExtension.forPath(file) != null) { + return FileVisitResult.CONTINUE; + } + final IterationResult result = action.process(file); + if (result == null) { + throw new IllegalArgumentException("Iteration result cannot be null."); + } + return result == IterationResult.TERMINATE ? FileVisitResult.TERMINATE + : FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) { + final Path dirName = dir.getFileName(); + if (dirName != null && MAC_TEMP_FOLDER.equals(dirName.toString())) { + return FileVisitResult.SKIP_SUBTREE; + } + return FileVisitResult.CONTINUE; + } + } + + record FullRecordImpl(String relativeFilePath, ByteArrayInputStream entryContent) implements FullRecord { + + @Override + public String getHarvestingIdentifier() { + return relativeFilePath; + } + + @Override + public void writeContent(OutputStream outputStream) throws IOException { + IOUtils.copy(entryContent, outputStream); + } + + @Override + public ByteArrayInputStream getContent() { + return entryContent; + } + + @Override + public boolean isDeleted() { + return false; + } + + @Override + public Instant getTimeStamp() { + return null; + } + } +} + + + + diff --git a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvester.java b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvester.java index a4aa09c65..cc56f3beb 100644 --- a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvester.java +++ b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvester.java @@ -6,10 +6,8 @@ import eu.europeana.metis.harvesting.HarvestingIterator; import eu.europeana.metis.harvesting.ReportingIteration; import eu.europeana.metis.utils.CompressedFileExtension; -import java.io.ByteArrayInputStream; import java.io.InputStream; import java.nio.file.Path; -import java.util.function.Consumer; /** * Implementations of this interface provide the functionality to harvest from HTTP (compressed archive). @@ -27,7 +25,7 @@ public interface HttpHarvester { * @return An iterator that provides access to the decompressed records. * @throws HarvesterException In case there was an issue during the harvest. */ - HttpRecordIterator harvestRecords(String archiveUrl, String downloadDirectory) + HarvestingIterator harvestRecords(String archiveUrl, String downloadDirectory) throws HarvesterException; /** @@ -41,34 +39,7 @@ HttpRecordIterator harvestRecords(String archiveUrl, String downloadDirectory) * @throws HarvesterException In case there was an issue during the harvest. */ void harvestFullRecords(InputStream inputStream, CompressedFileExtension compressedFileType, - ReportingIteration action) throws HarvesterException; - - /** - * Harvest from HTTP (compressed archive). This is a convenience method for {@link #harvestRecords(String, String)} that copies - * the input stream to a temporary file (in the system's temporary directory) first. An attempt will be made to remove the - * temporary file before this method returns. - * - * @param inputStream The input stream containing the compressed archive. - * @param compressedFileType The type of the archive. - * @param action The action to be performed. - * @throws HarvesterException In case there was an issue during the harvest. - * @deprecated Use {@link #harvestFullRecords(InputStream, CompressedFileExtension, ReportingIteration)} instead. - */ - @Deprecated - void harvestRecords(InputStream inputStream, CompressedFileExtension compressedFileType, - Consumer action) throws HarvesterException; - - /** - * It creates a {@link HttpRecordIterator} with a InputStream into a temporary file directory. When finished using the created - * iterator, the iterator should be closed to clean up leftover files. - * - * @param input The input stream from which we create the iterator - * @param compressedFileType The type of compressed file type - * @return A HttpRecordIterator based on a temporary file location - * @throws HarvesterException In case there is an issue while using the input stream - */ - HttpRecordIterator createTemporaryHttpHarvestIterator(InputStream input, - CompressedFileExtension compressedFileType) throws HarvesterException; + ReportingIteration action) throws HarvesterException; /** * It creates a {@link HarvestingIterator} with a InputStream into a temporary file directory. When finished using the created @@ -79,29 +50,7 @@ HttpRecordIterator createTemporaryHttpHarvestIterator(InputStream input, * @return A HttpRecordIterator based on a temporary file location * @throws HarvesterException In case there is an issue while using the input stream */ - FullRecordHarvestingIterator createFullRecordHarvestIterator(InputStream input, + FullRecordHarvestingIterator createFullRecordHarvestIterator(InputStream input, CompressedFileExtension compressedFileType) throws HarvesterException; - /** - * An object representing an entry in a file archive. The harvesting identifier is the file name - * (including the path relative to the archive root). - */ - interface ArchiveEntry extends FullRecord { - - /** - * @return The name of the entry. This is the file name (including extension, excluding the path). - * @deprecated Use {@link #getHarvestingIdentifier()} instead. - */ - @Deprecated - default String getEntryName() { - return Path.of(getHarvestingIdentifier()).getFileName().toString(); - } - - /** - * @return The content of the entry (in memory). - * @deprecated Use {@link #getContent()} instead. - */ - @Deprecated - ByteArrayInputStream getEntryContent(); - } } diff --git a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvesterImpl.java b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvesterImpl.java index 4bc5aa1ee..e39d20168 100644 --- a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvesterImpl.java +++ b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpHarvesterImpl.java @@ -4,14 +4,13 @@ import static eu.europeana.metis.utils.TempFileUtils.createSecureTempDirectoryAndFile; import static org.apache.commons.io.FileUtils.copyInputStreamToFile; +import eu.europeana.metis.harvesting.FullRecord; import eu.europeana.metis.harvesting.FullRecordHarvestingIterator; import eu.europeana.metis.harvesting.HarvesterException; import eu.europeana.metis.harvesting.HarvestingIterator; import eu.europeana.metis.harvesting.ReportingIteration; -import eu.europeana.metis.harvesting.ReportingIteration.IterationResult; import eu.europeana.metis.utils.CompressedFileExtension; import eu.europeana.metis.utils.CompressedFileHandler; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -20,21 +19,14 @@ import java.net.URISyntaxException; import java.net.URL; import java.net.URLConnection; -import java.nio.file.FileVisitResult; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.nio.file.SimpleFileVisitor; -import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.attribute.PosixFilePermission; -import java.time.Instant; import java.util.Iterator; import java.util.Set; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Consumer; import java.util.function.Predicate; import java.util.stream.Stream; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; @@ -51,9 +43,9 @@ public class HttpHarvesterImpl implements HttpHarvester { @Override public void harvestFullRecords(InputStream inputStream, - CompressedFileExtension compressedFileType, ReportingIteration action) + CompressedFileExtension compressedFileType, ReportingIteration action) throws HarvesterException { - try (final HarvestingIterator iterator = createFullRecordHarvestIterator(inputStream, compressedFileType)) { + try (final HarvestingIterator iterator = createFullRecordHarvestIterator(inputStream, compressedFileType)) { iterator.forEach(action); } catch (IOException e) { throw new HarvesterException("Could not clean up.", e); @@ -61,16 +53,7 @@ public void harvestFullRecords(InputStream inputStream, } @Override - public void harvestRecords(InputStream inputStream, CompressedFileExtension compressedFileType, - Consumer action) throws HarvesterException { - this.harvestFullRecords(inputStream, compressedFileType, file -> { - action.accept(file); - return IterationResult.CONTINUE; - }); - } - - @Override - public HttpRecordIterator harvestRecords(String archiveUrl, String downloadDirectory) + public HarvestingIterator harvestRecords(String archiveUrl, String downloadDirectory) throws HarvesterException { // Download the archive. Note that we allow any directory here (even on other file systems), @@ -89,17 +72,11 @@ public HttpRecordIterator harvestRecords(String archiveUrl, String downloadDirec } @Override - public FullRecordHarvestingIterator createFullRecordHarvestIterator(InputStream input, + public FullRecordHarvestingIterator createFullRecordHarvestIterator(InputStream input, CompressedFileExtension compressedFileType) throws HarvesterException { return new RecordIterator(extractArchiveSecurely(input, compressedFileType)); } - @Override - public HttpRecordIterator createTemporaryHttpHarvestIterator(InputStream input, - CompressedFileExtension compressedFileType) throws HarvesterException { - return new PathIterator(extractArchiveSecurely(input, compressedFileType)); - } - private Path extractArchiveSecurely(InputStream input, CompressedFileExtension compressedFileType) throws HarvesterException { try { @@ -186,21 +163,21 @@ private static void correctDirectoryRights(Path directory) throws IOException { } } - private static class RecordIterator extends AbstractHttpHarvestIterator - implements FullRecordHarvestingIterator { + private static class RecordIterator extends AbstractHttpHarvestIterator + implements FullRecordHarvestingIterator { public RecordIterator(Path extractedDirectory) { super(extractedDirectory); } @Override - public void forEachFiltered(ReportingIteration action, Predicate filter) + public void forEachFiltered(ReportingIteration action, Predicate filter) throws HarvesterException { forEachFileFiltered(action, filter); } } - private static class PathIterator extends AbstractHttpHarvestIterator implements HttpRecordIterator { + private static class PathIterator extends AbstractHttpHarvestIterator { public PathIterator(Path extractedDirectory) { super(extractedDirectory); @@ -216,153 +193,5 @@ public void forEachFiltered(ReportingIteration action, Predicate fil public String getExtractedDirectory() { return super.getExtractedDirectory(); } - - @Override - public void deleteIteratorContent() { - this.close(); - } - } - - /** - * Iterator for harvesting - */ - private abstract static class AbstractHttpHarvestIterator implements HarvestingIterator { - - private final Path extractedDirectory; - - public AbstractHttpHarvestIterator(Path extractedDirectory) { - if (extractedDirectory == null) { - throw new IllegalStateException("Extracted directory is null. This should not happen."); - } - this.extractedDirectory = extractedDirectory; - } - - protected String getExtractedDirectory() { - return extractedDirectory.toString(); - } - - @Override - public void close() { - try { - FileUtils.deleteDirectory(extractedDirectory.toFile()); - } catch (IOException e) { - LOGGER.warn("Could not delete directory.", e); - } - } - - public void forEachPathFiltered(ReportingIteration action, Predicate filter) - throws HarvesterException { - try { - Files.walkFileTree(extractedDirectory, new FileIteration(action, filter)); - } catch (IOException e) { - throw new HarvesterException("Exception while iterating through the extracted files.", e); - } - } - - public void forEachFileFiltered(ReportingIteration action, Predicate filter) - throws HarvesterException { - forEachPathFiltered(path -> { - try (InputStream content = Files.newInputStream(path)) { - return action.process(new ArchiveEntryImpl(extractedDirectory.relativize(path).toString(), - new ByteArrayInputStream(IOUtils.toByteArray(content)))); - } catch (RuntimeException e) { - throw new IOException("Could not process path " + path + ".", e); - } - }, filter); - } - - @Override - public void forEachNonDeleted(ReportingIteration action) throws HarvesterException { - forEach(action); - } - - @Override - public Integer countRecords() throws HarvesterException { - // Go by each path only: no need to inspect the full file. - final AtomicInteger counter = new AtomicInteger(0); - forEachPathFiltered(path -> { - counter.incrementAndGet(); - return IterationResult.CONTINUE; - }, path -> true); - return counter.get(); - } - } - - private static class FileIteration extends SimpleFileVisitor { - - private static final String MAC_TEMP_FILE = ".DS_Store"; - private static final String MAC_TEMP_FOLDER = "__MACOSX"; - - private final ReportingIteration action; - private final Predicate filter; - - public FileIteration(ReportingIteration action, Predicate filter) { - this.action = action; - this.filter = filter; - } - - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - if (!filter.test(file)) { - return FileVisitResult.CONTINUE; - } - final Path fileName = file.getFileName(); - if (fileName != null && MAC_TEMP_FILE.equals(fileName.toString())) { - return FileVisitResult.CONTINUE; - } - if (CompressedFileExtension.forPath(file) != null) { - return FileVisitResult.CONTINUE; - } - final IterationResult result = action.process(file); - if (result == null) { - throw new IllegalArgumentException("Iteration result cannot be null."); - } - return result == IterationResult.TERMINATE ? FileVisitResult.TERMINATE - : FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) { - final Path dirName = dir.getFileName(); - if (dirName != null && MAC_TEMP_FOLDER.equals(dirName.toString())) { - return FileVisitResult.SKIP_SUBTREE; - } - return FileVisitResult.CONTINUE; - } - } - - private record ArchiveEntryImpl(String relativeFilePath, ByteArrayInputStream entryContent) - implements ArchiveEntry { - - @Override - public String getHarvestingIdentifier() { - return relativeFilePath; - } - - @Override - public void writeContent(OutputStream outputStream) throws IOException { - IOUtils.copy(entryContent, outputStream); - } - - @Override - public ByteArrayInputStream getContent() { - return entryContent; - } - - @Override - public boolean isDeleted() { - return false; - } - - @Override - @Deprecated - public ByteArrayInputStream getEntryContent() { - return getContent(); - } - - @Override - public Instant getTimeStamp() { - return null; - } } } diff --git a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpRecordIterator.java b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpRecordIterator.java deleted file mode 100644 index f8891458d..000000000 --- a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/HttpRecordIterator.java +++ /dev/null @@ -1,41 +0,0 @@ -package eu.europeana.metis.harvesting.http; - -import eu.europeana.metis.harvesting.HarvesterException; -import eu.europeana.metis.harvesting.HarvestingIterator; -import java.nio.file.Path; - -/** - * Implementations of this interface provide iterative access to the decompressed results of a HTTP - * (compressed archive) harvest. Closing this iterator cleans up the downloaded and decompressed - * files. - * @deprecated Use {@link HarvestingIterator} instead. - */ -@Deprecated -public interface HttpRecordIterator extends HarvestingIterator { - - /** - * Returns the extracted directory used to create the iterator if there is any - * - * @return The extracted directory as a string if there is any, empty string if there is none - * - * @deprecated This method seems to be used only to compute the relative path of the file. Use - * {@link eu.europeana.metis.harvesting.http.HttpHarvester.ArchiveEntry#getHarvestingIdentifier()} - * instead. - */ - @Deprecated - String getExtractedDirectory(); - - /** - * @deprecated Use {@link #close()} instead. - */ - @Deprecated - void deleteIteratorContent(); - - /** - * @deprecated Use {@link #countRecords()} instead. - */ - @Deprecated - default int getExpectedRecordCount() throws HarvesterException { - return countRecords(); - } -} From 7526b9ea242e031340e71137f3d966a4433974e9 Mon Sep 17 00:00:00 2001 From: Simon Tzanakis Date: Thu, 5 Sep 2024 13:37:53 +0200 Subject: [PATCH 2/2] MET-6103 Process review --- .../harvesting/http/AbstractHttpHarvestIterator.java | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java index 00c0f7e81..353de5435 100644 --- a/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java +++ b/metis-harvesting/src/main/java/eu/europeana/metis/harvesting/http/AbstractHttpHarvestIterator.java @@ -17,6 +17,7 @@ import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.time.Instant; +import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Predicate; import org.apache.commons.io.FileUtils; @@ -33,9 +34,7 @@ abstract class AbstractHttpHarvestIterator implements HarvestingIterator