
The article [link] stated that the performance of Haskell code was superior to C ++ code. What immediately aroused interest, as both of them can be generated by the LLVM compiler, which means either Eskell can give more hints to the compiler, or something is wrong with the C ++ implementation. Next, we will analyze how a series of accidents in the author's actions led to incorrect conclusions, which are described in the table below (under the cut).
Foreword
Recently, another article from0xd34df00dHaskell code optimization. It is compared in such cases naturally with the undeniable leader in performance - C / C ++. Then came a parsing of this article fromyleoabout what asm code is really better, and what is the difference between the implementations on different languages (I recommend reading). Even earlier (about one and a half months ago), the previous article from the Haskell vs C / C ++ series was published, and I did a similar analysis, but instead of publishing it to Habr - alas, I put it in the back box. The heated discussions in the comments this week prompted me to return to the previous topic. Today I finally got that markdown document out of the drawer, dusted off the dust,finished, and provide it for your review.
Introduction
, [], :
/++, .. , , " , ". "", . , , , 10- .
-, , ++ , , , . , zero-cost, , , , - ++, . , clang 3 (!) , +, , .. lang llvm — .
, : gcc clang. , . ( ), gcc . , , , , , .
std::min({...})
, , std::min.
C++.
, std::min({delCost, insCost, substCost})
std::min(substCost, std::min(delCost, insCost))
,
clang — 0.840
, .
( — 0xd34df00d)
:
A.unsafeWrite v1 (j + 1) $ min (substCost + substCostBase) $ 1 + min delCost insCost
, min
! (, ). , , C++ , , llvm llvm. , . , , "skell " . , " ++; , PHP" , , . :
stdlib gcc - std::min
, ++. std::min_element
. , , , :
f(int, int, int):
cmp esi, edi
mov eax, edx
cmovg esi, edi
cmp esi, edx
cmovle eax, esi
ret
: cmov*
= conditional move (: g
— greater, le
— less equal, ..).
, , , , clang, gcc, - ( rsp ):
fptr(int*, int*, int*):
mov eax, dword ptr [rdi]
mov dword ptr [rsp - 12], eax
mov ecx, dword ptr [rsi]
mov dword ptr [rsp - 8], ecx
cmp ecx, eax
cmovle eax, ecx
mov ecx, dword ptr [rdx]
cmp ecx, eax
cmovle eax, ecx
ret
, clang . initializer_list
asm -O1
, (-O2
), asm . , std::min(std::initializer_list)
-, -, , .
s1[i]
, — , ++.
s1[i]
! ()
, , , - , , . , s1[i]
,
let s1char = s1 `BS.index` i
let go j | j == n = pure ()
++, , clang. .. + llvm - , -march=native
. , , std::min
, , ! , - , " " , .
C++ .
,
C main', .
( )
, , , - , :
size_t lev_dist(const std::string &s1, const std::string &s2) {
const auto m = s1.size();
const auto n = s2.size();
std::vector<int> v0;
v0.resize(n + 1);
std::iota(v0.begin(), v0.end(), 0);
auto v1 = v0;
for (size_t i = 0; i < m; ++i) {
v1[0] = i + 1;
char c1 = s1[i];
for (size_t j = 0; j < n; ++j) {
auto substCost = c1 == s2[j] ? v0[j] : (v0[j] + 1);
v1[j + 1] = std::min(substCost, std::min(v0[j + 1], v1[j]) + 1);
}
std::swap(v0, v1);
}
return v0[n];
}
32- int
— , ( ).
, GCC . j
, GCC. clang .
LLVM == LLVM
-, , ++ Haskell , clang-9. , Skylake C++ . , , , Haswell, .
, , , GCC LLVM.
, , llvm, ffi, .
GCC vs CLANG
-, , gcc . , clang- .
, , , , (#if !defined(__GNUC__) || defined(__llvm__)
), ++ , , ++ .
clang ( ) . ( )
, GCC LLVM. , asm. gcc - : , cmov* min ( , ). (3), , , ++ :
for (size_t j = 0; j < n; ++j) {
auto delCost = v0[j + 1] + 1;
auto insCost = v1[j] + 1;
auto substCost = c1 == s2[j] ? v0[j] : (v0[j] + 1);
v1[j + 1] = std::min(substCost, std::min(delCost, insCost));
}
, , :
.L42:
inc rcx // j++
mov rdi, QWORD PTR [r12+rcx*8] // v0[j+1]
xor edx, edx // %edx
cmp r10b, BYTE PTR [r11-1+rcx] // c1 == s2[j]
setne dl // %rdx
lea r9, [rdi+1] // v0[j+1] + 1
add rdx, QWORD PTR [r12-8+rcx*8] // v0[j]
lea rsi, [rax+1] // %rax v1[j]
cmp rdi, rax // v0[j+1] v1[j] += 1
mov rax, r9
cmovg rax, rsi // += 1
cmp rax, rdx // %rax, %rdx
cmovg rax, rdx
mov QWORD PTR [r8+rcx*8], rax // v1[j+1] = ...
cmp rbx, rcx // loop
jne .L42
, — v1[j]
%rax
.
LLVM, - , . , , :
.LBB1_40: # in Loop: Header=BB1_36 Depth=2
mov qword ptr [r14 + 8*rsi + 8], rax
mov rdx, qword ptr [rbx + 8*rsi + 16]
inc rdx
inc rax
xor ebp, ebp
cmp cl, byte ptr [r13 + rsi + 1]
setne bpl
add rbp, qword ptr [rbx + 8*rsi + 8]
cmp rax, rdx
jg .LBB1_41
lea rdi, [rsi + 2]
cmp rax, rbp
jle .LBB1_44
jmp .LBB1_43
: jmp, j*
= jump (: jg
— greater, jle
— less equal, ..).
v0[j+1]
, v0[j]
, cmp
s1[i]
, cmp + jump . , , ( , ) , , . — .
, GCC , LLVM 2 (!) , .
, . : (jump), — (cmov).
, — .
, , . , , PGO (, JIT ). , GCC PGO clang. — , . , , // , , , .
- , —
- , LLVM
- , GCC LLVM ,
- ffi. , .. , , ,
, , .
: , , "" , , . Rust (, ).
P.S.
— , -
( )
, — , .. , , , s1 s2. . -, O(n*m) ( for). , . , k. , . , .
LinearLeopard .
- :
-O3 -march=native -std=gnu++17
. - Processor: Intel i5-8250U (yes, laptop)
- OS: Ubuntu 19.04 / Linux 5.0.0
- The first run to accelerate the turbo boost, then take a minimum of five in a row. Deviations within 1-2%.
- Between the launches of different 1s implementations, we’ll cool down (yes, laptop)
Added: scripts for the lazy
You can run the same thing on your hardware and tell the public the result: a link to the github .
Added: Results without -march = native
According to requests in the comments, I decided to check the influence of this flag.