summaryrefslogtreecommitdiff
path: root/lib/igt_x86.c
diff options
context:
space:
mode:
author: Chris Wilson <chris@chris-wilson.co.uk> 2018-02-27 21:45:14 +0000
committer: Chris Wilson <chris@chris-wilson.co.uk> 2018-03-01 08:46:14 +0000
commit: 6a06d014a38fc8282f7dda7c11e5cacd9daf44ca (patch)
tree: 3ed22f665ea29a563f9298618ea5d119e2d5e24d /lib/igt_x86.c
parent: 5aed726a723d0abd42e36a26dd6349739fefd568 (diff)
lib: Provide an accelerated routine for readback from WC
Reading from WC is awfully slow as each access is uncached and so performed synchronously, stalling for the memory load. x86 did introduce some new instructions in SSE 4.1 to provide a small internal buffer to accelerate reading back a cacheline at a time from uncached memory, for this purpose.

v2: Don't be lazy and handle misalignment.
v3: Switch out of sse41 before emitting the generic memcpy routine
v4: Replace opencoded memcpy_from_wc
v5: Always flush the internal buffer before use (Eric)
v6: Assume bulk moves, so check for dst alignment.
v7: Use _mm_mfence instead of __builtin_ia32_mfence for consistency, remove superfluous defines (Ville)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Eric Anholt <eric@anholt.net>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Diffstat (limited to 'lib/igt_x86.c')
-rw-r--r--lib/igt_x86.c116
1 files changed, 116 insertions, 0 deletions
diff --git a/lib/igt_x86.c b/lib/igt_x86.c
index 0ed3c6f1..cb1e0a72 100644
--- a/lib/igt_x86.c
+++ b/lib/igt_x86.c
@@ -36,7 +36,11 @@
#endif
#include "igt_x86.h"
+#include "igt_aux.h"
+
+#include <stdint.h>
#include <stdio.h>
+#include <string.h>
/**
* SECTION:igt_x86
@@ -174,3 +178,115 @@ char *igt_x86_features_to_string(unsigned features, char *line)
return ret;
}
#endif
+
+#if defined(__x86_64__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#pragma GCC diagnostic ignored "-Wpointer-arith"
+
+#include <smmintrin.h>
+/*
+ * Copy len bytes from src to dst, fetching the source in 16-byte units with
+ * the SSE4.1 streaming load (_mm_stream_load_si128).
+ *
+ * Steps, in order:
+ *  1. _mm_mfence() before any load.
+ *  2. Head: if src is not 16-byte aligned, load the aligned 16-byte block
+ *     containing src into buf and copy out only the wanted bytes.
+ *  3. Bulk: 64 bytes per iteration (four loads, then four stores), using
+ *     aligned stores when dst is 16-byte aligned and unaligned stores
+ *     otherwise.
+ *  4. Remaining whole 16-byte blocks, one at a time, with unaligned stores.
+ *  5. Tail: load one more full 16-byte block into buf and copy out the
+ *     final len (< 16) bytes.
+ *
+ * Steps 2 and 5 always read a full 16-byte block from the source, so source
+ * bytes outside [src, src + len) that share those blocks are read, though
+ * never written to dst.
+ *
+ * dst and src are advanced with arithmetic on void pointers; the
+ * -Wpointer-arith diagnostic is disabled by the pragma above this function.
+ */
+static void memcpy_from_wc_sse41(void *dst, const void *src, unsigned long len)
+{
+ /* Bounce buffer for the partial head and tail blocks. */
+ char buf[16];
+
+ /* Flush the internal buffer of potential stale gfx data */
+ _mm_mfence();
+
+ /*
+ * Head: src is misaligned, so issue the streaming load on the enclosing
+ * 16-byte aligned block S and pick the wanted bytes out of buf.
+ * NOTE(review): when len == 0 and src is misaligned the load from S is
+ * still issued (copy is then 0) - confirm callers never rely on a
+ * zero-length call touching no memory.
+ */
+ if ((uintptr_t)src & 15) {
+ __m128i *S = (__m128i *)((uintptr_t)src & ~15);
+ unsigned long misalign = (uintptr_t)src & 15;
+ /*
+ * Bytes to take from this block: up to its end, capped at len.
+ * min() is not defined in this hunk - presumably provided by
+ * igt_aux.h; verify.
+ */
+ unsigned long copy = min(len, 16 - misalign);
+
+ _mm_storeu_si128((__m128i *)buf,
+ _mm_stream_load_si128(S));
+
+ memcpy(dst, buf + misalign, copy);
+
+ dst += copy;
+ src += copy;
+ len -= copy;
+ }
+
+ /*
+ * From here on src is 16-byte aligned unless len has already dropped
+ * to 0, in which case none of the loops below execute.
+ */
+
+ /* We assume we are doing bulk transfers, so prefer aligned moves */
+ if (((uintptr_t)dst & 15) == 0) {
+ while (len >= 64) {
+ __m128i *S = (__m128i *)src;
+ __m128i *D = (__m128i *)dst;
+ __m128i tmp[4];
+
+ /* All four 16-byte loads are issued before any store. */
+ tmp[0] = _mm_stream_load_si128(S + 0);
+ tmp[1] = _mm_stream_load_si128(S + 1);
+ tmp[2] = _mm_stream_load_si128(S + 2);
+ tmp[3] = _mm_stream_load_si128(S + 3);
+
+ _mm_store_si128(D + 0, tmp[0]);
+ _mm_store_si128(D + 1, tmp[1]);
+ _mm_store_si128(D + 2, tmp[2]);
+ _mm_store_si128(D + 3, tmp[3]);
+
+ src += 64;
+ dst += 64;
+ len -= 64;
+ }
+ } else {
+ /* Same 64-byte loop, but dst is unaligned: use unaligned stores. */
+ while (len >= 64) {
+ __m128i *S = (__m128i *)src;
+ __m128i *D = (__m128i *)dst;
+ __m128i tmp[4];
+
+ tmp[0] = _mm_stream_load_si128(S + 0);
+ tmp[1] = _mm_stream_load_si128(S + 1);
+ tmp[2] = _mm_stream_load_si128(S + 2);
+ tmp[3] = _mm_stream_load_si128(S + 3);
+
+ _mm_storeu_si128(D + 0, tmp[0]);
+ _mm_storeu_si128(D + 1, tmp[1]);
+ _mm_storeu_si128(D + 2, tmp[2]);
+ _mm_storeu_si128(D + 3, tmp[3]);
+
+ src += 64;
+ dst += 64;
+ len -= 64;
+ }
+ }
+
+ /* Whole 16-byte blocks left over after the 64-byte loops. */
+ while (len >= 16) {
+ _mm_storeu_si128((__m128i *)dst,
+ _mm_stream_load_si128((__m128i *)src));
+
+ src += 16;
+ dst += 16;
+ len -= 16;
+ }
+
+ /*
+ * Tail: fewer than 16 bytes remain. A full block is still loaded into
+ * buf, but only the first len bytes are copied to dst.
+ */
+ if (len) {
+ _mm_storeu_si128((__m128i *)buf,
+ _mm_stream_load_si128((__m128i *)src));
+ memcpy(dst, buf, len);
+ }
+}
+
+#pragma GCC pop_options
+
+/*
+ * Generic implementation: a plain memcpy() of len bytes from src to dst.
+ * resolve_memcpy_from_wc() returns this when igt_x86_features() does not
+ * report SSE4_1.
+ */
+static void memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+ memcpy(dst, src, len);
+}
+
+/*
+ * Resolver named by the ifunc attribute on igt_memcpy_from_wc(). Returns a
+ * pointer to the implementation to use: memcpy_from_wc_sse41() when
+ * igt_x86_features() has the SSE4_1 bit set, otherwise the memcpy()-based
+ * memcpy_from_wc().
+ */
+static void (*resolve_memcpy_from_wc(void))(void *, const void *, unsigned long)
+{
+ if (igt_x86_features() & SSE4_1)
+ return memcpy_from_wc_sse41;
+
+ return memcpy_from_wc;
+}
+
+/*
+ * Public entry point for the x86-64, non-clang build. No body is given
+ * here: the ifunc attribute names resolve_memcpy_from_wc() as the resolver
+ * that supplies the implementation.
+ */
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+ __attribute__((ifunc("resolve_memcpy_from_wc")));
+
+#else
+/*
+ * Fallback for builds where the #if above is false (not __x86_64__, or
+ * compiled with clang): igt_memcpy_from_wc() is a plain memcpy().
+ */
+void igt_memcpy_from_wc(void *dst, const void *src, unsigned long len)
+{
+ memcpy(dst, src, len);
+}
+#endif