-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Enhancement] Optimize code in arm (#55072)
## Why I'm doing: arm is slower than x86 in some cases ## What I'm doing: 1. vectorize rf's insert_hash using Neon intrinsics 2. streamvbyte's cmakelist is wrong, which cause performance downgrade in arm because vectorization cannot work properly 3. arm's int128_mul_overflow is super slow becase of divide operation, __builtin_mul_overflow(int128_t a, int128_t b, int128_t* c) is fast enough when compile with gcc. But gcc's __builtin_mul_overflow is at least 5 times faster then clang in arm, we already reported it to the community: llvm/llvm-project#123262. So we still use gcc as default compiler and use __builtin_mul_overflow to replace original int128_mul_overflow implementation 4. arm's cast int128 to double is super slow in arm with gcc because the bad implementation of __floattidf, clang runtime-rt's implementation is 20 times faster then gcc, so I used clang compiler-rt's implementation to replace gcc's version after this pr, arm is faster then gcc in the most of cases. ``` | Query | arm-opt | x86 | |---------|--------|--------| | QUERY01 | 36 | 61 | | QUERY02 | 39 | 62 | | QUERY14 | 1510 | 1514 | | QUERY15 | 1407 | 1496 | | QUERY17 | 21 | 88 | | QUERY20 | 151 | 279 | | QUERY21 | 1526 | 1529 | | QUERY24 | 1399 | 1504 | | QUERY26 | 32 | 122 | | QUERY27 | 1493 | 1519 | | QUERY90 | 3399 | 4030 | | QUERY97 | 3859 | 4776 | | QUERY98 | 2763 | 3208 | | QUERY99 | 868 | 1259 | ``` Signed-off-by: before-Sunrise <[email protected]>
- Loading branch information
1 parent
ec7e5e3
commit e88bb85
Showing
7 changed files
with
159 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "runtime/int128_to_double.h" | ||
|
||
#include <glog/logging.h> | ||
|
||
#include <climits> | ||
#include <cstdint> | ||
|
||
#include "integer_overflow_arithmetics.h" | ||
namespace starrocks { | ||
double __wrap___floattidf(__int128 a) { | ||
typedef double dst_t; | ||
typedef uint64_t dst_rep_t; | ||
typedef __uint128_t usrc_t; | ||
#define DST_REP_C UINT64_C | ||
|
||
enum { | ||
dstSigBits = 52, | ||
}; | ||
|
||
if (a == 0) return 0.0; | ||
|
||
enum { | ||
dstMantDig = dstSigBits + 1, | ||
srcBits = sizeof(__int128) * CHAR_BIT, | ||
srcIsSigned = ((__int128)-1) < 0, | ||
}; | ||
|
||
const __int128 s = srcIsSigned ? a >> (srcBits - 1) : 0; | ||
|
||
a = (usrc_t)(a ^ s) - s; | ||
int sd = srcBits - clz128(a); // number of significant digits | ||
int e = sd - 1; // exponent | ||
if (sd > dstMantDig) { | ||
// start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx | ||
// finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR | ||
// 12345678901234567890123456 | ||
// 1 = msb 1 bit | ||
// P = bit dstMantDig-1 bits to the right of 1 | ||
// Q = bit dstMantDig bits to the right of 1 | ||
// R = "or" of all bits to the right of Q | ||
if (sd == dstMantDig + 1) { | ||
a <<= 1; | ||
} else if (sd == dstMantDig + 2) { | ||
// Do nothing. | ||
} else { | ||
a = ((usrc_t)a >> (sd - (dstMantDig + 2))) | | ||
((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0); | ||
} | ||
// finish: | ||
a |= (a & 4) != 0; // Or P into R | ||
++a; // round - this step may add a significant bit | ||
a >>= 2; // dump Q and R | ||
// a is now rounded to dstMantDig or dstMantDig+1 bits | ||
if (a & ((usrc_t)1 << dstMantDig)) { | ||
a >>= 1; | ||
++e; | ||
} | ||
// a is now rounded to dstMantDig bits | ||
} else { | ||
a <<= (dstMantDig - sd); | ||
// a is now rounded to dstMantDig bits | ||
} | ||
const int dstBits = sizeof(dst_t) * CHAR_BIT; | ||
const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1); | ||
const int dstExpBits = dstBits - dstSigBits - 1; | ||
const int dstExpBias = (1 << (dstExpBits - 1)) - 1; | ||
const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1; | ||
// Combine sign, exponent, and mantissa. | ||
const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | ((dst_rep_t)(e + dstExpBias) << dstSigBits) | | ||
((dst_rep_t)(a)&dstSignificandMask); | ||
|
||
const union { | ||
dst_t f; | ||
dst_rep_t i; | ||
} rep = {.i = result}; | ||
|
||
DCHECK(std::abs(rep.f - __real___floattidf(a)) < 0.001); | ||
return rep.f; | ||
} | ||
} // namespace starrocks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Copyright 2021-present StarRocks, Inc. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
namespace starrocks { | ||
extern "C" { | ||
// origin from llvm-project | ||
// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/int_to_fp_impl.inc | ||
// this implementation is 20x faster than gcc | ||
double __wrap___floattidf(__int128 a); | ||
double __real___floattidf(__int128 a); | ||
} | ||
} // namespace starrocks |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,43 @@ | ||
From 676d0175085a7996f909d9d2e63ab7b4683ef475 Mon Sep 17 00:00:00 2001 | ||
From: before-Sunrise <[email protected]> | ||
Date: Tue, 14 Jan 2025 18:41:46 +0800 | ||
Subject: [PATCH] patch | ||
|
||
Signed-off-by: before-Sunrise <[email protected]> | ||
--- | ||
CMakeLists.txt | 5 +---- | ||
include/streamvbyte.h | 2 +- | ||
2 files changed, 2 insertions(+), 5 deletions(-) | ||
|
||
diff --git a/CMakeLists.txt b/CMakeLists.txt | ||
index 39df85d..1e32b0c 100644 | ||
--- a/CMakeLists.txt | ||
+++ b/CMakeLists.txt | ||
@@ -32,10 +32,7 @@ if (MSVC) | ||
endif() | ||
# test for arm | ||
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") | ||
- set(BASE_FLAGS | ||
- ${BASE_FLAGS} | ||
- "-D__ARM_NEON__" | ||
- ) | ||
+ add_compile_options(-D__ARM_NEON__) | ||
endif() | ||
set(STREAMVBYTE_SRCS | ||
${PROJECT_SOURCE_DIR}/src/streamvbyte_encode.c | ||
diff --git a/include/streamvbyte.h b/include/streamvbyte.h | ||
index bc9533c..a6cbb1a 100644 | ||
--- a/include/streamvbyte.h | ||
+++ b/include/streamvbyte.h | ||
@@ -1,7 +1,7 @@ | ||
|
||
#ifndef INCLUDE_STREAMVBYTE_H_ | ||
#define INCLUDE_STREAMVBYTE_H_ | ||
-#define __STDC_FORMAT_MACROS | ||
+// #define __STDC_FORMAT_MACROS | ||
#include <inttypes.h> | ||
#include <stdint.h>// please use a C99-compatible compiler | ||
#include <stddef.h> | ||
-- | ||
2.34.1 | ||
|