Skip to content

Commit

Permalink
[Enhancement] Optimize code in arm (#55072)
Browse files Browse the repository at this point in the history
## Why I'm doing:
arm is slower than x86 in some cases

## What I'm doing:
1. vectorize rf's insert_hash using Neon intrinsics
2. streamvbyte's cmakelist is wrong, which cause performance downgrade in arm because vectorization cannot work properly
3. arm's int128_mul_overflow is super slow becase of divide operation,  __builtin_mul_overflow(int128_t a, int128_t b, int128_t* c) is fast enough when compile with gcc. But gcc's __builtin_mul_overflow is at least 5 times faster then clang in arm, we already reported it to the community: llvm/llvm-project#123262. So we still use gcc as default compiler and use __builtin_mul_overflow to replace original int128_mul_overflow implementation
4. arm's cast int128 to double  is super slow in arm with gcc because the bad implementation of __floattidf, clang runtime-rt's implementation is 20 times faster then gcc, so I used clang compiler-rt's implementation to replace gcc's version


after this pr, arm is faster then gcc in  the most of cases.

```
| Query   | arm-opt | x86 |
|---------|--------|--------|
| QUERY01 | 36     | 61     |
| QUERY02 | 39     | 62     |
| QUERY14 | 1510   | 1514   |
| QUERY15 | 1407   | 1496   |
| QUERY17 | 21     | 88     |
| QUERY20 | 151    | 279    |
| QUERY21 | 1526   | 1529   |
| QUERY24 | 1399   | 1504   |
| QUERY26 | 32     | 122    |
| QUERY27 | 1493   | 1519   |
| QUERY90 | 3399   | 4030   |
| QUERY97 | 3859   | 4776   |
| QUERY98 | 2763   | 3208   |
| QUERY99 | 868    | 1259   |
```

Signed-off-by: before-Sunrise <[email protected]>
  • Loading branch information
before-Sunrise authored Jan 27, 2025
1 parent ec7e5e3 commit e88bb85
Show file tree
Hide file tree
Showing 7 changed files with 159 additions and 21 deletions.
2 changes: 1 addition & 1 deletion be/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1010,7 +1010,7 @@ endif()

set(STARROCKS_LINK_LIBS ${STARROCKS_LINK_LIBS}
${WL_LINK_STATIC} -lbfd
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap=__cxa_throw
${WL_LINK_DYNAMIC} -lresolv -liberty -lc -lm -ldl -rdynamic -pthread -Wl,-wrap,__cxa_throw -Wl,-wrap,__floattidf
)

# link gcov if WITH_GCOV is on
Expand Down
9 changes: 9 additions & 0 deletions be/src/exprs/runtime_filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ class SimdBlockFilter {
const __m256i mask = make_mask(hash >> _log_num_buckets);
__m256i* const bucket = &reinterpret_cast<__m256i*>(_directory)[bucket_idx];
_mm256_store_si256(bucket, _mm256_or_si256(*bucket, mask));
#elif defined(__ARM_NEON)
uint32x4_t masks[2];
make_mask(hash >> _log_num_buckets, masks);
uint32x4_t directory_1 = vld1q_u32(&_directory[bucket_idx][0]);
uint32x4_t directory_2 = vld1q_u32(&_directory[bucket_idx][4]);
directory_1 = vorrq_u32(directory_1, masks[0]);
directory_2 = vorrq_u32(directory_2, masks[1]);
vst1q_u32(&_directory[bucket_idx][0], directory_1);
vst1q_u32(&_directory[bucket_idx][4], directory_2);
#else
uint32_t masks[BITS_SET_PER_BLOCK];
make_mask(hash >> _log_num_buckets, masks);
Expand Down
1 change: 1 addition & 0 deletions be/src/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ set(RUNTIME_FILES
dictionary_cache_sink.cpp
type_pack.cpp
customized_result_writer.cpp
int128_to_double.cpp
)

set(RUNTIME_FILES ${RUNTIME_FILES}
Expand Down
94 changes: 94 additions & 0 deletions be/src/runtime/int128_to_double.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/int128_to_double.h"

#include <glog/logging.h>

#include <climits>
#include <cstdint>

#include "integer_overflow_arithmetics.h"
namespace starrocks {
double __wrap___floattidf(__int128 a) {
typedef double dst_t;
typedef uint64_t dst_rep_t;
typedef __uint128_t usrc_t;
#define DST_REP_C UINT64_C

enum {
dstSigBits = 52,
};

if (a == 0) return 0.0;

enum {
dstMantDig = dstSigBits + 1,
srcBits = sizeof(__int128) * CHAR_BIT,
srcIsSigned = ((__int128)-1) < 0,
};

const __int128 s = srcIsSigned ? a >> (srcBits - 1) : 0;

a = (usrc_t)(a ^ s) - s;
int sd = srcBits - clz128(a); // number of significant digits
int e = sd - 1; // exponent
if (sd > dstMantDig) {
// start: 0000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQxxxxxxxxxxxxxxxxxx
// finish: 000000000000000000000000000000000000001xxxxxxxxxxxxxxxxxxxxxxPQR
// 12345678901234567890123456
// 1 = msb 1 bit
// P = bit dstMantDig-1 bits to the right of 1
// Q = bit dstMantDig bits to the right of 1
// R = "or" of all bits to the right of Q
if (sd == dstMantDig + 1) {
a <<= 1;
} else if (sd == dstMantDig + 2) {
// Do nothing.
} else {
a = ((usrc_t)a >> (sd - (dstMantDig + 2))) |
((a & ((usrc_t)(-1) >> ((srcBits + dstMantDig + 2) - sd))) != 0);
}
// finish:
a |= (a & 4) != 0; // Or P into R
++a; // round - this step may add a significant bit
a >>= 2; // dump Q and R
// a is now rounded to dstMantDig or dstMantDig+1 bits
if (a & ((usrc_t)1 << dstMantDig)) {
a >>= 1;
++e;
}
// a is now rounded to dstMantDig bits
} else {
a <<= (dstMantDig - sd);
// a is now rounded to dstMantDig bits
}
const int dstBits = sizeof(dst_t) * CHAR_BIT;
const dst_rep_t dstSignMask = DST_REP_C(1) << (dstBits - 1);
const int dstExpBits = dstBits - dstSigBits - 1;
const int dstExpBias = (1 << (dstExpBits - 1)) - 1;
const dst_rep_t dstSignificandMask = (DST_REP_C(1) << dstSigBits) - 1;
// Combine sign, exponent, and mantissa.
const dst_rep_t result = ((dst_rep_t)s & dstSignMask) | ((dst_rep_t)(e + dstExpBias) << dstSigBits) |
((dst_rep_t)(a)&dstSignificandMask);

const union {
dst_t f;
dst_rep_t i;
} rep = {.i = result};

DCHECK(std::abs(rep.f - __real___floattidf(a)) < 0.001);
return rep.f;
}
} // namespace starrocks
22 changes: 22 additions & 0 deletions be/src/runtime/int128_to_double.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace starrocks {
extern "C" {
// origin from llvm-project
// https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/int_to_fp_impl.inc
// this implementation is 20x faster than gcc
double __wrap___floattidf(__int128 a);
double __real___floattidf(__int128 a);
}
} // namespace starrocks
20 changes: 1 addition & 19 deletions be/src/runtime/integer_overflow_arithmetics.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,25 +113,7 @@ inline int clz128(unsigned __int128 v) {
}

inline bool int128_mul_overflow(int128_t a, int128_t b, int128_t* c) {
if (a == 0 || b == 0) {
*c = 0;
return false;
}

// sgn(x)
auto sa = a >> 127;
// sgn(y)
auto sb = b >> 127;
// abx(x), abs(y)
a = (a ^ sa) - sa;
b = (b ^ sb) - sb;
// sgn(x * y)
sa ^= sb;
*c = a * b;
// sgn(x * y) and abs(x) * abs(y) produces x * y;
*c = (*c ^ sa) - sa;
static constexpr auto int128_max = get_max<int128_t>();
return clz128(a) + clz128(b) < sizeof(int128_t) || int128_max / a < b;
return __builtin_mul_overflow(a, b, c);
}

template <>
Expand Down
32 changes: 31 additions & 1 deletion thirdparty/patches/streamvbyte.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,43 @@
From 676d0175085a7996f909d9d2e63ab7b4683ef475 Mon Sep 17 00:00:00 2001
From: before-Sunrise <[email protected]>
Date: Tue, 14 Jan 2025 18:41:46 +0800
Subject: [PATCH] patch

Signed-off-by: before-Sunrise <[email protected]>
---
CMakeLists.txt | 5 +----
include/streamvbyte.h | 2 +-
2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39df85d..1e32b0c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,10 +32,7 @@ if (MSVC)
endif()
# test for arm
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
- set(BASE_FLAGS
- ${BASE_FLAGS}
- "-D__ARM_NEON__"
- )
+ add_compile_options(-D__ARM_NEON__)
endif()
set(STREAMVBYTE_SRCS
${PROJECT_SOURCE_DIR}/src/streamvbyte_encode.c
diff --git a/include/streamvbyte.h b/include/streamvbyte.h
index bc9533c..a6cbb1a 100644
--- a/include/streamvbyte.h
+++ b/include/streamvbyte.h
@@ -1,7 +1,7 @@

#ifndef INCLUDE_STREAMVBYTE_H_
#define INCLUDE_STREAMVBYTE_H_
-#define __STDC_FORMAT_MACROS
+// #define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include <stdint.h>// please use a C99-compatible compiler
#include <stddef.h>
--
2.34.1

0 comments on commit e88bb85

Please sign in to comment.