Alegrias e tristezas das vitórias sobre C: fazemos doces a partir do protótipo wc em Haskell

Olá, Habr.


Portanto, a última vez que provamos empiricamente que você pode escrever facilmente um tipo de brinquedo wc no Haskell, que é significativamente mais rápido que a implementação do GNU Coreutils wc. É claro que essa não é uma comparação completamente honesta: nosso programa não pode fazer nada além de contar bytes, seqüências de caracteres e palavras, enquanto o wc real é muito mais poderoso: possui mais algumas estatísticas, suporta opções, pode ler a partir de stdin ... Em resumo, nós realmente acabou sendo apenas um brinquedo.


Hoje vamos consertar. Nosso principal objetivo é permitir que o usuário selecione estatísticas específicas para o cálculo, sem contar o que o usuário não precisa. E o mais importante: nos esforçaremos pela modularidade, destacando cada estatística em uma unidade isolada separada.


De fato, se olharmos para a versão C - bem, pessoalmente, eu não chamaria de amostra de código legível e suportado, já que tudo acontece lá em uma grande função em 370 linhas. Vamos tentar evitar isso.



A principal função da versão C não se encaixava na tela 4k na orientação retrato com a quarta fonte.


Além dessa modularização, nós, entre outras coisas:


  • vamos expressar a ideia de que algumas estatísticas, como contar o número de bytes, podem funcionar com mais eficiência em toda a entrada, enquanto outras devem olhar para cada byte;
  • implementamos ainda mais estatísticas, aproveitando a oportunidade para falar sobre cada uma delas individualmente (o que é chamado de raciocínio local);
  • escreveremos alguns testes, aproveitando o raciocínio local mais uma vez;
  • tentaremos algumas técnicas de digitação quase dependente, tendo recebido com êxito o código de funcionamento correto, mas encantadoramente de frenagem;
  • Template Haskell;
  • () () .

, :


{-# LANGUAGE Strict #-}
{-# LANGUAGE RecordWildCards #-}

module Data.WordCount where

import qualified Data.ByteString.Lazy as BS

data State = State
  { bs :: Int
  , ws :: Int
  , ls :: Int
  , wasSpace :: Int
  }

wc :: BS.ByteString -> (Int, Int, Int)
wc s = (bs, ws + 1 - wasSpace, ls)
  where
    State { .. } = BS.foldl' go (State 0 0 0 1) s

    go State { .. } c = State (bs + 1) (ws + addWord) (ls + addLine) isSp
      where
        isSp | c == 32 || c - 9 <= 4 = 1
             | otherwise = 0
        addLine | c == 10 = 1
                | otherwise = 0
        addWord = (1 - wasSpace) * isSp
{-# INLINE wc #-}

, , . ?



, . , BS.foldl'!


foldl, «, ». , ! , , ByteString. , : ( , length ), count ( count 10). , , !


, , . :


{-# LANGUAGE Strict #-}

import qualified Control.Foldl as L
import qualified Data.ByteString as BS

data WordState = WordState { ws :: Int, wasSpace :: Int }

wordsCount :: L.Fold BS.ByteString Int
wordsCount = L.Fold (BS.foldl' go) (WordState 0 1) (\WordState { .. } -> ws + 1 - wasSpace)
  where
    go WordState { .. } c = WordState (ws + addWord) isSp
      where
        isSp | c == 32 || c - 9 <= 4 = 1
             | otherwise = 0
        addWord = (1 - wasSpace) * isSp

, :


import qualified Control.Foldl.ByteString as BL
import qualified Data.ByteString.Lazy as BSL

main :: IO ()
main = do
  [path] <- getArgs
  contents <- unsafeMMapFile path
  let res = BL.fold ((,,) <$> BL.length <*> BL.count 10 <*> wordsCount) (BSL.fromStrict contents) :: (Int, Int, Int)
  print res

! ?


, ( 1.8- , tmpfs- IO, ), 2.5 . , , , ( wc, ), IDE, .


, 2.5 . , .


, ?


  let res = BL.fold ((,) <$> BL.length <*> wordsCount) contents :: (Int, Int)
  print res

1.55 . . ?


  let res = BL.fold (BL.count 10) contents :: Int
  print res

1.05 .


. . ! , ( '\n') , .


. , , foldl.


foldl . . foldl ByteString', ByteString', . — 256 , L1-, L2 L1 ( L1 ).


, , 16-32 , L1, .


, , , , : BL.fold ((,) <$> wordsCount <*> wordsCount) contents ( , ) BL.fold wordsCount contents. .


, , « », , .


...



, , . - . ?


- «» ( , ) f1, f2, f3, f1 f3. , , ,


  -- options —   ,   
  options <- parseCliOptions

  -- theFold —  
  let theFold = foldl' f (zip options [f1, f2, f3]) emptyFold
  where
    f acc (True, stat) = acc `compose` stat
    f acc (False, _) = acc

, , .


, — . , , , .



?


— . , , , :


{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE MultiParamTypeClasses, UndecidableInstances, FlexibleInstances #-}
{-# LANGUAGE TypeFamilyDependencies, FunctionalDependencies, PolyKinds, DataKinds, GADTs, TypeOperators #-}
{-# LANGUAGE ScopedTypeVariables #-}

- ? :


  • ,
  • -, ,
  • -,
  • .

:


class Statistic s res st | res -> s, st -> s
                         , s -> res, s -> st
                         where
  initState :: st
  extractState :: st -> res
  step :: st -> Word8 -> st

s ( DataKinds), , , . , - :


  initState :: proxy1 s -> proxy2 res -> st
  extractState :: proxy s -> st -> res
  step :: proxy1 s -> proxy2 res -> st -> Word8 -> st

, , , .


, , , , .



, - . , , , . , , . , O(n)O(1). , (, , ) «» ( SIMD-, ). , . , , .


, , , ? GADT! - , GADT , . :


data StatCompTyOf = Chunked | ByteOnly

data StatComputation st compTy where
  ChunkedComputation :: (st -> Word8 -> st)
                     -> (st -> BS.ByteString -> st)
                     -> StatComputation st 'Chunked
  ByteOnlyComputation :: (st -> Word8 -> st)
                      -> StatComputation st 'ByteOnly

( ) BS — , .


Statistic, comp step computation:


class Statistic s res st comp | res -> s, st -> s
                              , s -> res, s -> st, s -> comp
                              where
  initState :: st
  extractState :: st -> res
  computation :: StatComputation st comp

. s, res, st, .


, , . , SIMD- , 16-32 . .



? :


  • ,
  • (UTF-8)-,
  • ,
  • ,
  • .

:


data Statistics = Bytes | Chars | Words | MaxLL | Lines deriving (Eq, Ord)

, wc .


Statistic. , — , :


newtype Tagged a = Tagged Word64 deriving (Eq, Show, Num)

a , Tagged 'Bytes Tagged 'Chars.


: :


instance Statistic 'Bytes (Tagged 'Bytes) (Tagged 'Bytes) 'Chunked where
  initState = 0
  extractState = id
  computation = ChunkedComputation (\st _ -> st + 1) (\st str -> st + fromIntegral (BS.length str))

, , :


  1. , Bytes , Tagged 'Bytes , . , .
  2. ( , ) 0.
  3. , , — .
  4. computation , 'Chunked . , .

.


, , ,


.

, , :


instance Statistic 'Lines (Tagged 'Lines) (Tagged 'Lines) 'Chunked where
  initState = 0
  extractState = id
  computation = ChunkedComputation (\st c -> st + if c == 10 then 1 else 0) (\st str -> st + fromIntegral (BS.count 10 str))

? :


data WordsState = WordsState { ws :: Word64, wasSpace :: Word64 }

instance Statistic 'Words (Tagged 'Words) WordsState 'ByteOnly where
  initState = WordsState 0 1
  extractState WordsState { .. } = Tagged (ws + 1 - wasSpace)
  computation = ByteOnlyComputation step
    where
      step WordsState { .. } c = WordsState (ws + (1 - wasSpace) * isSp) isSp
        where
          isSp | c == 32 || c - 9 <= 4 = 1
               | otherwise = 0

, .


, , . — UTF-8- ?


Toda a dificuldade de contar caracteres é manipular com precisão os bits:


instance Statistic 'Chars (Tagged 'Chars) (Tagged 'Chars) 'ByteOnly where
  initState = 0
  extractState = id
  computation = ByteOnlyComputation $ \cnt c ->
        cnt + 1 - fromIntegral (   ((c .&. 0b10000000) `shiftR` 7)
                               .&. (1 - ((c .&. 0b01000000) `shiftR` 6))
                               )

UTF-8: , 10xxxxxx. , UTF-8 , .


? (, , , ASCII):


instance Statistic 'MaxLL (Tagged 'MaxLL) MaxLLState 'ByteOnly where
  initState = MaxLLState 0 0
  extractState MaxLLState { .. } = Tagged $ max maxLen curLen
  computation = ByteOnlyComputation step
    where
      step MaxLLState { .. } 9 = MaxLLState maxLen $ curLen + 8 - (curLen `rem` 8)
      step MaxLLState { .. } 8 = MaxLLState maxLen $ max 0 (curLen - 1)
      step MaxLLState { .. } c | c == 10
                              || c == 12
                              || c == 13 = MaxLLState (max maxLen curLen) 0
                               | c < 32 = MaxLLState maxLen curLen
      step MaxLLState { .. } _ = MaxLLState maxLen (curLen + 1)

, backspace, wc!


, . : .



a — , b — , — , . :


infixr 5 :::
data a ::: b = a ::: b deriving (Show)

(,), , , , , .


, .


-, ? , , . :


type family CombineCompTy a b where
  CombineCompTy 'Chunked 'Chunked = 'Chunked
  CombineCompTy _ _ = 'ByteOnly

Statistic ? - :


instance (Statistic sa resa sta compa, Statistic sb resb stb compb)
       => Statistic (sa '::: sb) (resa ::: resb) (sta ::: stb) (CombineCompTy compa compb) where
  initState = initState ::: initState
  extractState (a ::: b) = extractState a ::: extractState b
  computation =
    case (computation :: StatComputation sta compa, computation :: StatComputation stb compb) of
         (ByteOnlyComputation a, ChunkedComputation b _)
            -> ByteOnlyComputation $ combine a b
         (ChunkedComputation a _, ByteOnlyComputation b)
            -> ByteOnlyComputation $ combine a b
         (ByteOnlyComputation a, ByteOnlyComputation b) 
            -> ByteOnlyComputation $ combine a b
         (ChunkedComputation stepA chunkA, ChunkedComputation stepB chunkB)
            -> ChunkedComputation (combine stepA stepB) (combine chunkA chunkB)
    where
      combine fa fb = \(a ::: b) w -> fa a w ::: fb b w

, saresa, sta compa, sb/resb/stb/compb, sa ::: sb — , — resa ::: resb, — sta ::: stb, — CombineCompTy compa compb.


( , ) :::--- ( , ) :::--- . sa sb, , () , — , () .


, … . . , comp , :


instance (Statistic sa resa sta compa,
          Statistic sb resb stb compb,
          comp ~ CombineCompTy compa compb)
       => Statistic (sa '::: sb) (resa ::: resb) (sta ::: stb) comp where

.


. :


  1. .
  2. , - .

, — , !


, CombineCompTy ( ). , , ( ), .


- , , , , , case .


? , StatComputation st comp, , , comp ~ CombineCompTy compa compb. , compa compb. CombineCompTy, , Chunked compa, compb, .


compa compb? . , computation, GADT. , StatComputation. ChunkedComputation, comp Chunked. ByteOnlyComputation, ByteOnly.


, CombineCompTy _-, , compa, compb.


, : Words '::: Words , . ⊤ , , , .



, . ?


, Statistic, ByteString, . GADT, computation. ChunkedComputation, . ByteOnlyComputation, BS.foldl'. :


wc :: forall s res st comp. Statistic s res st comp => BS.ByteString -> res
wc s = extractState $! runCompute computation
  where
    runCompute :: StatComputation st comp -> st
    runCompute (ByteOnlyComputation step) = BS.foldl' step initState s
    runCompute (ChunkedComputation _ chunker) = chunker initState s

, (s, st, comp) res. , , runCompute, . st comp wc, ( ) forall ScopedTypeVariables.


:


let result = wc someBS :: Tagged 'Words ::: Tagged 'Lines

TypeApplications s :


let result = wc @('Words '::: 'Lines) someBS

, , , , .



, ?


, wc @'Words, . — 1.51 , , , . , .


? wc @('Words '::: 'Words)!


. , , , — , … : 1.34 . wc @('Words '::: 'Words '::: 'Words)? 1.30 . , 'Words .


— . #haskell — . .


. GHC Core — , . , , , - . , … -. , , , .


, , . , ? wc @('Bytes '::: 'Words '::: 'Lines)! — 1.53 . 1.45 , , , , .


, , . , .



— ! , .


, , , words, 127. , , .


QuickCheck-, ASCII, UTF-8-:


import qualified Data.ByteString.Char8 as BS
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
--   

import Data.WordCount

wrapUnicode :: UnicodeString -> (BS.ByteString, T.Text)
wrapUnicode ustr = (T.encodeUtf8 txt, txt)
  where
    txt = T.pack $ getUnicodeString ustr

replaceNonAsciiSpaces :: Char -> Char
replaceNonAsciiSpaces ch | ch >= chr 127 && isSpace ch = '_'
                         | otherwise = ch

main :: IO ()
main = hspec $ parallel $ modifyMaxSuccess (const 10000) $ modifyMaxSize (const 1000) $ do
  describe "ASCII support" $ do
    it "Counts bytes correctly" $ property $
      \(getASCIIString -> str) -> wc @'Bytes (BS.pack str) `shouldBe` genericLength str
    it "Counts chars correctly" $ property $
      \(getASCIIString -> str) -> wc @'Chars (BS.pack str) `shouldBe` genericLength str
    it "Counts words correctly" $ property $
      \(getASCIIString -> str) -> wc @'Words (BS.pack str) `shouldBe` genericLength (words str)
    it "Counts lines correctly" $ property $
      \(getASCIIString -> str) -> wc @'Lines (BS.pack str) `shouldBe` genericLength (filter (== '\n') str)
  describe "UTF8 support" $ do
    it "Counts bytes correctly" $ property $
      \(wrapUnicode -> (bs, _))   -> wc @'Bytes bs `shouldBe` fromIntegral (BS.length bs)
    it "Counts chars correctly" $ property $
      \(wrapUnicode -> (bs, txt)) -> wc @'Chars bs `shouldBe` fromIntegral (T.length txt)
    it "Counts words correctly" $ property $
      \(wrapUnicode -> (bs, txt)) -> wc @'Words bs `shouldBe` genericLength (T.words $ T.map replaceNonAsciiSpaces txt)
    it "Counts lines correctly" $ property $
      \(wrapUnicode -> (bs, txt)) -> wc @'Lines bs `shouldBe` fromIntegral (T.count "\n" txt)

!


:


  • , , . , - UTF-8- .
  • , … .
  • : 10 ( ) 3-5 ( ).

, - C GNU Coreutils.



optparse-applicative. , , :


data Options = Options
  { countBytes :: Bool
  , countChars :: Bool
  , countLines :: Bool
  , countMaxLineLength :: Bool
  , countWords :: Bool
  , files :: [FilePath]
  }

options :: Parser Options
options = Options
  <$> switch (long "bytes" <> short 'c' <> help "print the byte counts")
  <*> switch (long "chars" <> short 'm' <> help "print the character counts")
  <*> switch (long "lines" <> short 'l' <> help "print the newline counts")
  <*> switch (long "max-line-length" <> short 'L' <> help "print the maximum display width")
  <*> switch (long "words" <> short 'w' <> help "print the word counts")
  <*> some (argument str (metavar "FILES..."))

main, Statistics, , :


main :: IO ()
main = do
  Options { .. } <- execParser $ info (options <**> helper) (fullDesc <> progDesc "Print newline, word, and byte counts for each file")
  let selectedStats = map snd $ filter fst [ (countBytes, Bytes), (countChars, Chars)
                                           , (countWords, Words), (countMaxLineLength, MaxLL)
                                           , (countLines, Lines)
                                           ]
  let stats | null selectedStats = [Bytes, Words, Lines]
            | otherwise = selectedStats

, , - .


, . ? , wc. , , . !



, , , , - , — !


, , , .


, , :


data SomeStats where
  MkSomeStats :: Statistic s res st comp => proxy s -> SomeStats

proxy sStatistic. — .


. ? - :


wc' :: SomeStats -> BS.ByteString -> ?
wc' (MkSomeStats (_ :: proxy s)) input = wc @s input

?? ? , res Statistic, .


, - ?


data SomeStats where
  MkSomeStats :: Statistic s res st comp => proxy1 s -> proxy2 res -> SomeStats

wc' :: SomeStats -> BS.ByteString -> res
wc' (MkSomeStats (_ :: proxy1 s) (_ :: proxy2 res)) input = wc @s input

, : wc' res, SomeStats, .


?


. wc, ! , . , , , , show, , res Show:


data SomeStats where
  MkSomeStats :: (Statistic s res st comp, Show res) => proxy s -> SomeStats

wc :


wc' :: SomeStats -> BS.ByteString -> String
wc' (MkSomeStats (_ :: proxy s)) input = show $ wc @s input

.


, stats SomeStats? :


promoteStat :: Statistics -> SomeStats
promoteStat Bytes = MkSomeStats (Proxy :: Proxy 'Bytes)
promoteStat Chars = MkSomeStats (Proxy :: Proxy 'Chars)
promoteStat Words = MkSomeStats (Proxy :: Proxy 'Words)
promoteStat MaxLL = MkSomeStats (Proxy :: Proxy 'MaxLL)
promoteStat Lines = MkSomeStats (Proxy :: Proxy 'Lines)

, , Statistics, . - : , Bytes () 'Bytes , , .


, promoteStat :


promoteStats :: [Statistics] -> SomeStats
promoteStats [s] = promoteStat s
promoteStats (s:ss) =
  case (promoteStat s, promoteStats ss) of
       (MkSomeStats (_ :: proxy1 st), MkSomeStats (_ :: proxy2 sst))
                                   -> MkSomeStats (Proxy :: Proxy (st '::: sst))

, , promoteStat.


, . : promoteStat, promoteStats. - , . promoteStat promoteStats, st , , sst — , .


, Statistic ( ). Statistic, st ::: sst Statistic - , ! , , rest resst ( , st sst) Show. , rest ::: resst Show, st ::: sst!


, , MkSomeStats (Proxy :: Proxy (st '::: sst)) . , !


, : . , , NonEmpty .


, :


main :: IO ()
main = do
  -- obtaining `stats` as before
  forM_ files $ \path -> do
    contents <- unsafeMMapFile path
    putStrLn $ wc' (promoteStats stats) contents


() ?


, 1.05 — , BS.count 10.


, . , , ? , … 14 .


, 14 .


, , :


  74,873,139,008 bytes allocated in the heap

, . , O(1)— GC (60,512 bytes maximum residency).


. , , ? 27 , 120 .


, ? ?


«42 », : 41 194 . , RTS 60 .


?


, :


wc' (MkSomeStats (_ :: proxy s)) input = show $ wc @s input

, computation s — -, , . , Statistic .


?


wc' computation, SomeStats wc, . , , , . , 1.8 — !


promoteStats. ?


, stats ,


promoteStats [s] = promoteStat s

s , , Words, promoteStat


promoteStat Words = MkSomeStats (Proxy :: Proxy 'Words)

promoteStats SomeStats , Statistic Words.


. , ?


promoteStats (s:ss) =
  case (promoteStat s, promoteStats ss) of
         (MkSomeStats (_ :: proxy1 st), MkSomeStats (_ :: proxy2 sst)) -> MkSomeStats (Proxy :: Proxy (st '::: sst))

computation : case , promoteStat promoteStats, , , computation Statistic «» sa ::: sb, , , .


, 13-14 , 28 — . , 65-70 .


, ( ) .


, . 13 (14 1 -) 1.8 — , 7 . .



: , () wc . -


main = do
  -- ..
  case stats of
       [Words] -> print $ wc @'Words contents
       [Bytes] -> print $ wc @'Bytes contents
       [Lines] -> print $ wc @'Lines contents
       [Words, Bytes] -> print $ wc @('Words '::: 'Bytes) contents
       [Lines, Bytes] -> print $ wc @('Lines '::: 'Bytes) contents
       -- ...

. , , 251=31case. , — .


, Template Haskell. (-) dispatch, :


  contents <- unsafeMMapFile path
  putStrLn $ $(dispatch 'wc 'contents) stats

$(dispatch 'wc 'contents) , case- stats , .


dispatch — Template Haskell, , :


dispatch :: Name -> Name -> Q Exp
dispatch fun bs = reify ''Statistics >>= \case
  TyConI (DataD _ _ _ _ cons _) -> do
    let consNames = [ name | NormalC name _ <- cons ]
    let powerset = filterM (const [True, False]) consNames
    let matches = buildMatch fun bs <$> filter (not . null) powerset
    fallbackMatch <- (\body -> Match WildP (NormalB body) []) <$> [e| error "Unexpected input" |]
    pure $ LamCaseE $ matches <> [fallbackMatch]
  _ -> fail "unsupported type"

buildMatch :: Name -> Name -> [Name] -> Match
buildMatch fun bs consNames = Match (ListP $ (`ConP` []) <$> consNames) (NormalB $ VarE 'show `AppE` (wcCall `AppE` VarE bs)) []
  where
    wcCall = VarE fun `AppTypeE` foldr1 f (PromotedT <$> consNames)
    f accTy promotedTy = PromotedT '(:::) `AppT` accTy `AppT` promotedTy

, Statistics (, , ), ( , powerset) case- buildMatch. -case ( {-# LANGUAGE LambdaCase #-}).


, , , , .


, (, TH) — , stats . . ,


  let selectedStats = map snd $ filter fst [ (countBytes, Bytes), (countChars, Chars)
                                           , (countWords, Words), (countMaxLineLength, MaxLL)
                                           , (countLines, Lines)
                                           ]

, Ord. , stats. , , - stats, , .


!


main :: IO ()
main = do
  Options { .. } <- execParser $ info (options <**> helper) (fullDesc <> progDesc "Print newline, word, and byte counts for each file")
  let selectedStats = map snd $ filter fst [ (countBytes, Bytes), (countChars, Chars)
                                           , (countWords, Words), (countMaxLineLength, MaxLL)
                                           , (countLines, Lines)
                                           ]
  let stats | null selectedStats = [Bytes, Words, Lines]
            | otherwise = selectedStats
  forM_ files $ \path -> do
    contents <- unsafeMMapFile path
    putStrLn $ $(dispatch 'wc 'contents) stats


? (31 — ), - . , wc GNU Coreutils , -. , : 5 1.8 , . wc Coreutils LC_ALL=C LANG=C, .


:


Haskell wc,Coreutils wc,
0.00 ¹0.00 ¹
1.5412.5
1.2012.5
1.06 / 0.24 ²0.26
1.5212.5
1.428.45 ³
2.2112.5
2.9212.5

:


  • , , .
  • , C (, , ) , . .
  • ¹ , time 0.00user .
  • ² (1.06 ) — bytestring. — bytestring, (count) SIMD-. count C, , , , ( , SIMD), .
  • ³ wc UTF-8-, , , , UTF-8-.

, .



, , wc, . ?



: forM_ forConcurrently_ async:


main :: IO ()
main = do
  Options { .. } <- execParser $ info (options <**> helper) (fullDesc <> progDesc "Print newline, word, and byte counts for each file")
  let selectedStats = map snd $ filter fst [ (countBytes, Bytes), (countChars, Chars)
                                           , (countWords, Words), (countMaxLineLength, MaxLL)
                                           , (countLines, Lines)
                                           ]
  let stats | null selectedStats = [Bytes, Words, Lines]
            | otherwise = selectedStats
  forConcurrently_ files $ \path -> do
    contents <- unsafeMMapFile path
    putStrLn $ $(dispatch 'wc 'contents) stats

, (, RTS, ). , -j, .


?


1.22 — , .


, ( ), 1.47 , . , .



,


Tagged 123 ::: (Tagged 456 ::: Tagged 789)

, ! Statistic:


class Statistic s res st comp | res -> s, st -> s
                              , s -> res, s -> st, s -> comp where
  -- ...
  prettyPrint :: res -> String

, :


instance Statistic 'Bytes (Tagged 'Bytes) (Tagged 'Bytes) 'Chunked where
  -- ...
  prettyPrint (Tagged n) = show n <> " bytes"

, :


  prettyPrint (a ::: b) = prettyPrint a <> "\n" <> prettyPrint b

buildMatch, prettyPrint show:


buildMatch fun bs consNames = Match (ListP $ (`ConP` []) <$> consNames) (NormalB $ VarE 'prettyPrint `AppE` (wcCall `AppE` VarE bs)) []

!



, mmap. , : , hwc <(cat foo | grep bar).


, , — - mmap . mmap . ByteString, . ByteString — , :


import qualified Data.ByteString.Lazy as BSL

wcLazy :: forall s res st comp. Statistic s res st comp => BSL.ByteString -> res
wcLazy s = extractState $! runCompute computation
  where
    runCompute :: StatComputation st comp -> st
    runCompute (ByteOnlyComputation step) = BSL.foldl' step initState s
    runCompute (ChunkedComputation _ chunker) = BSL.foldlChunks chunker initState s

main:


  forConcurrently_ files $ \path -> do
    stat <- getFileStatus path
    if isRegularFile stat || isSymbolicLink stat
      then countStrict stats $ unsafeMMapFile path
      else countLazy stats $ BSL.readFile path

:


  where
    countStrict stats act = do
      contents <- act
      putStrLn $ $(dispatch 'wc 'contents) stats
    countLazy stats act = do
      contents <- act
      putStrLn $ $(dispatch 'wcLazy 'contents) stats

!


, wcLazy , , , .


stdin


, Coreutils wc, — stdin. , main:


main = do
  -- ... as before ...

  when (null files) $ countLazy stats BSL.getContents

-


cat testfile.txt | /usr/bin/time hwc-exe -cw

? , , 1.40 — , 1.22 .



, - : ? , , !


« » ( - ), stack build 7.9 , 2.24 ( strip).


Template Haskell : 23 . , ( 31 ): 2.34 , — 4.3%.


stack build ( ). stack build --fast .


« » 5.6 --fast, TH … 7.8 . . .


, , C. , : wc GNU Coreutils 0.06 , — 21 . wc 24 . 100 , !


, , .



? , wc , Unix-, -wc , GNU Coreutils wc. — ( !), , , , . , - « , », C C++, C.


, :


  1. wc , , , ;
  2. ;
  3. , , ;
  4. , , , , , ;
  5. Template Haskell;
  6. , , ;
  7. , .

.


, :


  1. . , , , 16-32 ( L1). , 7 , 1.8 0.4 , 0.04%. Template Haskell!
  2. Nossa implementação ainda não é o equivalente total wc, faz algo diferente. Por exemplo, as estatísticas de contagem de caracteres não suportam codificações diferentes de UTF-8 ou ASCII, enquanto ele wctrabalha com uma localidade arbitrária, delegando todo o trabalho de processamento de caracteres glibc. Mas como nossa implementação facilita a adição de novas estatísticas, é muito simples adicionar suporte para algo assim e, o mais importante, não afetará, por exemplo, a contagem do número de linhas ou palavras. Mas sobre "não pague pelo que você não usa", eu já escrevi um pouco mais.

All Articles