diff --git a/src/Codec/Xlsx/Parser/Internal/Memoize.hs b/src/Codec/Xlsx/Parser/Internal/Memoize.hs new file mode 100644 index 00000000..9d338c75 --- /dev/null +++ b/src/Codec/Xlsx/Parser/Internal/Memoize.hs @@ -0,0 +1,60 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE TypeApplications #-} + +-- | I rewrote: https://hackage.haskell.org/package/unliftio-0.2.20/docs/src/UnliftIO.Memoize.html#Memoized +-- for monad trans basecontrol +-- we don't need a generic `m` anyway. it's good enough in base IO. +module Codec.Xlsx.Parser.Internal.Memoize + ( Memoized + , runMemoized + , memoizeRef + ) where + +import Control.Applicative as A +import Control.Monad (join) +import Control.Monad.IO.Class +import Data.IORef +import Control.Exception + +-- | A \"run once\" value, with results saved. Extract the value with +-- 'runMemoized'. For single-threaded usage, you can use 'memoizeRef' to +-- create a value. If you need guarantees that only one thread will run the +-- action at a time, use 'memoizeMVar'. +-- +-- Note that this type provides a 'Show' instance for convenience, but not +-- useful information can be provided. +-- +-- @since 0.2.8.0 +newtype Memoized a = Memoized (IO a) + deriving (Functor, A.Applicative, Monad) +instance Show (Memoized a) where + show _ = "<>" + +-- | Extract a value from a 'Memoized', running an action if no cached value is +-- available. +-- +-- @since 0.2.8.0 +runMemoized :: MonadIO m => Memoized a -> m a +runMemoized (Memoized m) = liftIO m +{-# INLINE runMemoized #-} + +-- | Create a new 'Memoized' value using an 'IORef' under the surface. Note that +-- the action may be run in multiple threads simultaneously, so this may not be +-- thread safe (depending on the underlying action). Consider using +-- 'memoizeMVar'. +-- +-- @since 0.2.8.0 +memoizeRef :: IO a -> IO (Memoized a) +memoizeRef action = do + ref <- newIORef Nothing + pure $ Memoized $ do + mres <- readIORef ref + res <- + case mres of + Just res -> pure res + Nothing -> do + res <- try @SomeException action + writeIORef ref $ Just res + pure res + either throwIO pure res diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index a3e2155c..23b2c557 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -63,6 +63,7 @@ module Codec.Xlsx.Parser.Stream , AddCellErrors(..) , CoordinateErrors(..) , TypeError(..) + , WorkbookError(..) ) where import qualified "zip" Codec.Archive.Zip as Zip @@ -107,6 +108,7 @@ import Data.Traversable (for) import Data.XML.Types import GHC.Generics import Control.DeepSeq +import Codec.Xlsx.Parser.Internal.Memoize import qualified Codec.Xlsx.Parser.Stream.HexpatInternal as HexpatInternal import Control.Monad.Base @@ -204,9 +206,9 @@ data WorkbookInfo = WorkbookInfo makeLenses 'WorkbookInfo data XlsxMState = MkXlsxMState - { _xs_shared_strings :: IORef (Maybe (V.Vector Text)) - , _xs_workbook_info :: IORef (Maybe WorkbookInfo) - , _xs_relationships :: IORef (Maybe Relationships) + { _xs_shared_strings :: Memoized (V.Vector Text) + , _xs_workbook_info :: Memoized WorkbookInfo + , _xs_relationships :: Memoized Relationships } newtype XlsxM a = XlsxM {_unXlsxM :: ReaderT XlsxMState Zip.ZipArchive a} @@ -261,61 +263,72 @@ parseSharedStrings = \case -- | Run a series of actions on an Xlsx file runXlsxM :: MonadIO m => FilePath -> XlsxM a -> m a -runXlsxM xlsxFile (XlsxM act) = do - env0 <- liftIO $ MkXlsxMState <$> - newIORef Nothing <*> newIORef Nothing <*> newIORef Nothing - Zip.withArchive xlsxFile $ runReaderT act env0 +runXlsxM xlsxFile (XlsxM act) = liftIO $ do + -- TODO: don't run the withArchive multiple times but use liftWith or runInIO instead + _xs_workbook_info <- memoizeRef (Zip.withArchive xlsxFile readWorkbookInfo) + _xs_relationships <- memoizeRef (Zip.withArchive xlsxFile readWorkbookRelatinoships) + _xs_shared_strings <- memoizeRef (Zip.withArchive xlsxFile parseSharedStringss) + Zip.withArchive xlsxFile $ runReaderT act $ MkXlsxMState{..} liftZip :: Zip.ZipArchive a -> XlsxM a liftZip = XlsxM . ReaderT . const -{-# SCC getOrParseSharedStringss #-} -getOrParseSharedStringss :: XlsxM (V.Vector Text) -getOrParseSharedStringss = do - sharedStringsRef <- asks _xs_shared_strings - mSharedStringss <- liftIO $ readIORef sharedStringsRef - case mSharedStringss of - Just strs -> pure strs - Nothing -> do - sharedStrsSel <- liftZip $ Zip.mkEntrySelector "xl/sharedStrings.xml" - hasSharedStrs <- liftZip $ Zip.doesEntryExist sharedStrsSel - sharedStrs <- - if not hasSharedStrs +parseSharedStringss :: Zip.ZipArchive (V.Vector Text) +parseSharedStringss = do + sharedStrsSel <- Zip.mkEntrySelector "xl/sharedStrings.xml" + hasSharedStrs <- Zip.doesEntryExist sharedStrsSel + if not hasSharedStrs then pure mempty else do let state0 = initialSharedStrings - byteSrc <- liftZip $ Zip.getEntrySource sharedStrsSel - st <- runExpat state0 byteSrc $ \evs -> forM_ evs $ \ev -> do + byteSrc <- Zip.getEntrySource sharedStrsSel + st <- liftIO $ runExpat state0 byteSrc $ \evs -> forM_ evs $ \ev -> do mTxt <- parseSharedStrings ev for_ mTxt $ \txt -> ss_list %= (`DL.snoc` txt) pure $ V.fromList $ DL.toList $ _ss_list st - liftIO $ writeIORef sharedStringsRef $ Just sharedStrs - pure sharedStrs + +{-# SCC getOrParseSharedStringss #-} +getOrParseSharedStringss :: XlsxM (V.Vector Text) +getOrParseSharedStringss = runMemoized =<< asks _xs_shared_strings + +readWorkbookInfo :: Zip.ZipArchive WorkbookInfo +readWorkbookInfo = do + sel <- Zip.mkEntrySelector "xl/workbook.xml" + src <- Zip.getEntrySource sel + sheets <- liftIO $ runExpat [] src $ \evs -> forM_ evs $ \case + StartElement ("sheet" :: ByteString) attrs -> do + nm <- lookupBy "name" attrs + sheetId <- lookupBy "sheetId" attrs + rId <- lookupBy "r:id" attrs + sheetNum <- either (throwM . ParseDecimalError sheetId) pure $ eitherDecimal sheetId + modify' (SheetInfo nm (RefId rId) sheetNum :) + _ -> pure () + pure $ WorkbookInfo sheets + +lookupBy :: MonadThrow m => ByteString -> [(ByteString, Text)] -> m Text +lookupBy fields attrs = maybe (throwM $ LookupError attrs fields) pure $ lookup fields attrs -- | Returns information about the workbook, found in -- xl/workbook.xml. The result is cached so the XML will only be -- decompressed and parsed once inside a larger XlsxM action. getWorkbookInfo :: XlsxM WorkbookInfo -getWorkbookInfo = do - ref <- asks _xs_workbook_info - mInfo <- liftIO $ readIORef ref - case mInfo of - Just info -> pure info - Nothing -> do - sel <- liftZip $ Zip.mkEntrySelector "xl/workbook.xml" - src <- liftZip $ Zip.getEntrySource sel - sheets <- runExpat [] src $ \evs -> forM_ evs $ \case - StartElement ("sheet" :: ByteString) attrs -> do - nm <- maybe (throwM MalformedWorkbook) pure $ lookup ("name" :: ByteString) attrs - sheetId <- maybe (throwM MalformedWorkbook) pure $ lookup "sheetId" attrs - rId <- maybe (throwM MalformedWorkbook) pure $ lookup "r:id" attrs - sheetNum <- either (const $ throwM MalformedWorkbook) pure $ eitherDecimal sheetId - modify' (SheetInfo nm (RefId rId) sheetNum :) - _ -> pure () - let info = WorkbookInfo sheets - liftIO $ writeIORef ref $ Just info - pure info +getWorkbookInfo = runMemoized =<< asks _xs_workbook_info + +readWorkbookRelatinoships :: Zip.ZipArchive Relationships +readWorkbookRelatinoships = do + sel <- Zip.mkEntrySelector "xl/_rels/workbook.xml.rels" + src <- Zip.getEntrySource sel + liftIO $ fmap Relationships $ runExpat mempty src $ \evs -> forM_ evs $ \case + StartElement ("Relationship" :: ByteString) attrs -> do + rId <- lookupBy "Id" attrs + rTarget <- lookupBy "Target" attrs + rType <- lookupBy "Type" attrs + modify' $ M.insert (RefId rId) $ + Relationship { relType = rType, + relTarget = T.unpack rTarget + } + _ -> pure () -- | Gets relationships for the workbook (this means the filenames in -- the relationships map are relative to "xl/" base path within the @@ -324,26 +337,7 @@ getWorkbookInfo = do -- The relationships xml file will only be parsed once when called -- multiple times within a larger XlsxM action. getWorkbookRelationships :: XlsxM Relationships -getWorkbookRelationships = do - ref <- asks _xs_relationships - mInfo <- liftIO $ readIORef ref - case mInfo of - Just info -> pure info - Nothing -> do - sel <- liftZip $ Zip.mkEntrySelector "xl/_rels/workbook.xml.rels" - src <- liftZip $ Zip.getEntrySource sel - rels <- fmap Relationships $ runExpat mempty src $ \evs -> forM_ evs $ \case - StartElement ("Relationship" :: ByteString) attrs -> do - rId <- maybe (throwM MalformedWorkbook) pure $ lookup ("Id" :: ByteString) attrs - rTarget <- maybe (throwM MalformedWorkbook) pure $ lookup "Target" attrs - rType <- maybe (throwM MalformedWorkbook) pure $ lookup "Type" attrs - modify' $ M.insert (RefId rId) $ - Relationship { relType = rType, - relTarget = T.unpack rTarget - } - _ -> pure () - liftIO $ writeIORef ref $ Just rels - pure rels +getWorkbookRelationships = runMemoized =<< asks _xs_relationships type HexpatEvent = SAXEvent ByteString Text @@ -392,8 +386,8 @@ runExpat :: forall state tag text. state -> ConduitT () ByteString (C.ResourceT IO) () -> ([SAXEvent tag text] -> StateT state IO ()) -> - XlsxM state -runExpat initialState byteSource handler = liftIO $ do + IO state +runExpat initialState byteSource handler = do -- Set up state ref <- newIORef initialState -- Set up parser and callbacks @@ -424,7 +418,7 @@ runExpatForSheet :: (SheetItem -> IO ()) -> XlsxM () runExpatForSheet initState byteSource inner = - void $ runExpat initState byteSource handler + void $ liftIO $ runExpat initState byteSource handler where sheetName = _ps_sheet_index initState handler evs = forM_ evs $ \ev -> do @@ -455,6 +449,8 @@ collectItems sheetId = do newtype SheetIndex = MkSheetIndex Int deriving newtype NFData +-- | This does *no* checking if the index exists or not. +-- you could have index out of bounds issues because of this. makeIndex :: Int -> SheetIndex makeIndex = MkSheetIndex @@ -500,7 +496,7 @@ countRowsInSheet (MkSheetIndex sheetId) = do mSrc :: Maybe (ConduitT () ByteString (C.ResourceT IO) ()) <- getSheetXmlSource sheetId for mSrc $ \sourceSheetXml -> do - runExpat @Int @ByteString @ByteString 0 sourceSheetXml $ \evs -> + liftIO $ runExpat @Int @ByteString @ByteString 0 sourceSheetXml $ \evs -> forM_ evs $ \case StartElement "row" _ -> modify' (+1) _ -> pure () @@ -591,7 +587,8 @@ data TypeError deriving Show deriving anyclass Exception -data WorkbookError = MalformedWorkbook +data WorkbookError = LookupError { lookup_attrs :: [(ByteString, Text)], lookup_field :: ByteString } + | ParseDecimalError Text String deriving Show deriving anyclass Exception diff --git a/xlsx.cabal b/xlsx.cabal index 9593bdfd..c7e66c1c 100644 --- a/xlsx.cabal +++ b/xlsx.cabal @@ -84,6 +84,7 @@ Library -- We could expose it but then this function is in the xlsx API for a long time. -- It be better to expose it in the upstream library instead I think. It was copied here so the parser can use it. Other-modules: Codec.Xlsx.Parser.Stream.HexpatInternal + , Codec.Xlsx.Parser.Internal.Memoize Build-depends: base >= 4.9.0.0 && < 5.0 , attoparsec