x11-libs/pixman: Bump and patch to latest git head

Use 0.21.6 as a base, and apply patches from upstream git for the NEON optimizations.
This commit is contained in:
Steev Klimaszewski
2011-03-13 05:57:04 -05:00
committed by steev
parent cfef46b0ea
commit 2a67d77463
23 changed files with 3015 additions and 0 deletions

23
x11-libs/pixman/Manifest Normal file
View File

@@ -0,0 +1,23 @@
AUX 0002-Fix-compilation-on-Win32.patch 1373 RMD160 27ab9d8e5ee15ca0ca2316c2088488f77cc04193 SHA1 b60a844b6f97405d5974838126ce1a581a5578fb SHA256 c167e98ac16db2f09d11e71b6acfc2436ea06ad5f5e91d829463e8a2428c8f1d
AUX 0003-test-Fix-tests-for-compilation-on-Windows.patch 5857 RMD160 c03eda4e6678e85da3d3aecb6f8af77bbbcba396 SHA1 78bc36587fabe4e5d984c1535d60bac1f27665bd SHA256 c17670b7a3603e3591e5f3264441b01dd1861c24681cea9bb63c1a4896471f09
AUX 0004-test-Add-Makefile-for-Win32.patch 2285 RMD160 956800336268328f68cbf80fcf0f1e1a8254ed41 SHA1 9bf6452ad0982af23ead0ebfff1c3a46ab8a1454 SHA256 23b281492ad50c090c3ae3d501f92a039edcd35b4019ba60566394a9b9c99a41
AUX 0005-Do-not-include-unused-headers.patch 1138 RMD160 e73bcdb3d39a3fe29a8d61fec12facbe0c15bb1f SHA1 77e320b1f0702e6b31214a7057c759f0cdec37fd SHA256 036fad75930a7a5981d0fe58749c1d1c7b066931d1bbcb7695ad8f45208c66e3
AUX 0006-test-Silence-MSVC-warnings.patch 1879 RMD160 1ee25d5477740736c3bbb1c925f14fa45b9baea9 SHA1 17b823ed9bca1423ce3e7df6384820cff5b2c4f7 SHA256 dd835cb47e6f54c7295e181c8cab32924f5b7aa79be630b1dffd4987b04535a2
AUX 0007-Main-loop-template-for-fast-single-pass-bilinear-sca.patch 18432 RMD160 45c3975ea38fdb4fe9ed927c60a020cf65c30726 SHA1 45023ba64a3c48d73d8d43b70dd38fa885b7ba7f SHA256 b96ae6c8bea2a900dd013f134f5223bf415fdc9f492f3854ee2b095451276857
AUX 0008-test-check-correctness-of-bilinear_pad_repeat_get_sc.patch 3637 RMD160 a8b2d0ddc050a521c1510b0f34c465e6e17d8b1d SHA1 f6b92ea26d7773cc826d63c175742194523b8480 SHA256 aa1354d2395925d53108269dc7f45ca4c16509318af794704c458339541d1ea8
AUX 0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch 5981 RMD160 577a6b80b87d4687798e86bd9fe777a536ca9d76 SHA1 ffe4dfd7b3464bf6271ae869483124f4b8df7fd7 SHA256 ab8e918705c5d8bc24944a9b34f1a6d941d6f88cd16db4476566cb4bbf535039
AUX 0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch 9330 RMD160 8601746564959d01c01993bd359c4b4db0beec8f SHA1 76b52d88701def10885f9f0592dea3b19707f6c1 SHA256 2da797eebe471eabca3da195ea295faf462c3f38330a11eee18e7247f7370477
AUX 0011-test-In-image_endian_swap-use-pixman_image_get_forma.patch 4741 RMD160 9054134e9c656a955c595bf004e51fc5652687fd SHA1 7c361a399fa4a20ca8e413b2cf0f29847d519cb1 SHA256 9c8fb1eb06e054fb0fdfece9c33e0b311a3949ba3550c4a95f5943a914e7a770
AUX 0012-test-Do-endian-swapping-of-the-source-and-destinatio.patch 1080 RMD160 6ca0c5e9597a765f03ac719357d9add04acdecea SHA1 6f74002a839afb6e5d91a8565776edfe19a29f07 SHA256 af280e15b33683841a7df486c8bbd21c9268958865652bdaa6389fdd3909a457
AUX 0013-ARM-use-prefetch-in-nearest-scaled-src_0565_0565.patch 2486 RMD160 cdb29a25ba6726bda75db4c2a37b29d628957085 SHA1 12465119abc6ae4aba91fb4ffa5e21c7d3044ad4 SHA256 88bc1c5118b1550f5b59a16a511da2675697f79bed863368d284e4a1b260f833
AUX 0014-ARM-common-macro-for-nearest-scaling-fast-paths.patch 4606 RMD160 2f826d65e0e3b80ad0fe3371137df128336c257f SHA1 75b051949e226dea0ac55d9a2618b688808793e8 SHA256 85d417a2160944b774dad489432b637a5662e0416a3919f095a93607772278ac
AUX 0015-ARM-assembly-optimized-nearest-scaled-src_8888_8888.patch 2712 RMD160 8d335c0f1585ad9882fdef63c14328f36a2a7eac SHA1 f0f024b9248a85bc59a503a0088c24a9b97b0646 SHA256 bd97cb792274b8d6d498f07479a314b2ef1d9059b11acc3979871d38abb30ceb
AUX 0016-ARM-new-bilinear-fast-path-template-macro-in-pixman-.patch 7188 RMD160 4fa7ee0d71533a47d699b78e8af9e83d60dae450 SHA1 7fa1dea003735971baf7f199a240c8eec6917f30 SHA256 6b99d26015f8953bf43cb3a7495a02efec4b807ce23166ccd5faca711acc2475
AUX 0017-ARM-NEON-common-macro-template-for-bilinear-scanline.patch 8195 RMD160 4cff07d6bd52966a57148064f4a7a2a9da73838f SHA1 fdd5b62e8f33d0f3f245d86fb1567e26c829a051 SHA256 b17b03a4f7516de8bd803320310b26b2c09b694730ae4eddb5ae56a092da03bb
AUX 0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch 6752 RMD160 53aaedec3527bd2a280b4b503504233306f92505 SHA1 db2606645ad5eb1bb0ec42b14116ef3aed0b9a3f SHA256 671c3c9d910f4ed8631149b31795879d35399679b8136847f6dbc94dda885a6d
AUX 0019-ARM-NEON-optimization-for-bilinear-scaled-src_8888_0.patch 2083 RMD160 e4ba51ef2842f4cf42acbfc85dc2ecb09fabe655 SHA1 ca597361fc9da1d4b74a875c145af1cb7e4abb34 SHA256 929326bc1eca3e45a9a42dca9890e9a5422621e0587470a0eb28ed088ff097b3
AUX 0020-ARM-NEON-optimization-for-bilinear-scaled-src_0565_x.patch 2063 RMD160 222b88cd6453fb4adc7f4c14b7b9d89a2c0e5c6d SHA1 f14961b189217cdb125d6ec8c002e8237c7f804a SHA256 ce0e82c68cef5fd9b4f7557ebb8c903bda7b09b496b04f6b7b229f42d7cd4bf0
AUX 0021-ARM-NEON-optimization-for-bilinear-scaled-src_0565_0.patch 2058 RMD160 316bc2158c3dda9edef498582b316020e2a5e3ed SHA1 54d123fdb551a0dede24f5d53fa98e359d5d5d13 SHA256 c540fd8abdb2a23d445f53f0ff35905b27e4104fd4d9c9d59cce2d68d4970a58
AUX 0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch 5971 RMD160 5a6956cbf719d928d1e6d5a4cd07f0ec22a62c41 SHA1 f04f6938337145bde6410ad8983e8a7e3749d9fa SHA256 5320bdbf0d1ddd3b753dedae61d5785b9db27ccdaeb49880508138c0e6113f57
DIST pixman-0.21.6.tar.bz2 457580 RMD160 6ad5979d123e0268426c08954fd7f6040f7a3859 SHA1 73198f8f9159e3ffc2294806f32fa2c8042b57e6 SHA256 35a9fc00fc55c022318a7ac48eb52de60360beec36008b0037f944f3d0d62e83
EBUILD pixman-0.21.6.ebuild 3157 RMD160 9188bc6dac468b30d4888a5a507d2c3145cb36c9 SHA1 a2b33238bc243e1e99b298e65d76482b2d32e73a SHA256 060cb6f797fa67bee354865a599f240150fe80373bab99f164dfd27d10aa76f2

View File

@@ -0,0 +1,42 @@
From 20ed723a5a42fb8636bc9a5f32974dec1b66a785 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Thu, 24 Feb 2011 10:44:04 +0100
Subject: [PATCH 02/22] Fix compilation on Win32
Makefile.win32 contained a typo and was missing the dependency on
the built sources.
---
pixman/Makefile.win32 | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/pixman/Makefile.win32 b/pixman/Makefile.win32
index 775fb5e..b5f9397 100644
--- a/pixman/Makefile.win32
+++ b/pixman/Makefile.win32
@@ -56,6 +56,8 @@ SOURCES = \
pixman-general.c \
$(NULL)
+BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
+
# MMX compilation flags
ifeq ($(MMX_VAR),on)
CFLAGS += $(MMX_CFLAGS)
@@ -122,7 +124,7 @@ endif
endif
# pixman compilation and linking
-$(CFG_VAR)/%.obj: %.c
+$(CFG_VAR)/%.obj: %.c $(BUILT_SOURCES)
@mkdir -p $(CFG_VAR)
@$(CC) -c $(CFLAGS) -Fo"$@" $<
@@ -141,4 +143,4 @@ pixman-combine64.h: pixman-combine.h.template make-combine.pl
clean_r:
@rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.lib $(CFG_VAR)/*.pdb $(CFG)/*.ilk || exit 0
- @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk pixman-combine32.c pixman-combine64.c pixman-combine64.c pixman-combine64.h || exit 0
+ @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk $(BUILT_SOURCES) || exit 0
--
1.7.3.4

View File

@@ -0,0 +1,232 @@
From 11305b4ecdd36a17592c5c75de9157874853ab20 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Tue, 22 Feb 2011 21:46:37 +0100
Subject: [PATCH 03/22] test: Fix tests for compilation on Windows
The Microsoft C compiler cannot handle subobject initialization and
Win32 does not provide snprintf.
Work around these limitations by using normal struct initialization
and using sprintf (a manual check shows that the buffer size is
sufficient).
---
test/composite.c | 29 +++++++++++++--------------
test/fetch-test.c | 52 ++++++++++++++++++++++----------------------------
test/trap-crasher.c | 20 +++++++++---------
3 files changed, 47 insertions(+), 54 deletions(-)
diff --git a/test/composite.c b/test/composite.c
index e14f954..08c6689 100644
--- a/test/composite.c
+++ b/test/composite.c
@@ -617,18 +617,18 @@ eval_diff (color_t *expected, color_t *test, pixman_format_code_t format)
}
static char *
-describe_image (image_t *info, char *buf, int buflen)
+describe_image (image_t *info, char *buf)
{
if (info->size)
{
- snprintf (buf, buflen, "%s %dx%d%s",
- info->format->name,
- info->size, info->size,
- info->repeat ? "R" :"");
+ sprintf (buf, "%s %dx%d%s",
+ info->format->name,
+ info->size, info->size,
+ info->repeat ? "R" :"");
}
else
{
- snprintf (buf, buflen, "solid");
+ sprintf (buf, "solid");
}
return buf;
@@ -710,10 +710,9 @@ composite_test (image_t *dst,
{
char buf[40];
- snprintf (buf, sizeof (buf),
- "%s %scomposite",
- op->name,
- component_alpha ? "CA " : "");
+ sprintf (buf, "%s %scomposite",
+ op->name,
+ component_alpha ? "CA " : "");
printf ("%s test error of %.4f --\n"
" R G B A\n"
@@ -735,9 +734,9 @@ composite_test (image_t *dst,
mask->color->b, mask->color->a,
dst->color->r, dst->color->g,
dst->color->b, dst->color->a);
- printf ("src: %s, ", describe_image (src, buf, sizeof (buf)));
- printf ("mask: %s, ", describe_image (mask, buf, sizeof (buf)));
- printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf)));
+ printf ("src: %s, ", describe_image (src, buf));
+ printf ("mask: %s, ", describe_image (mask, buf));
+ printf ("dst: %s\n\n", describe_image (dst, buf));
}
else
{
@@ -747,8 +746,8 @@ composite_test (image_t *dst,
src->color->b, src->color->a,
dst->color->r, dst->color->g,
dst->color->b, dst->color->a);
- printf ("src: %s, ", describe_image (src, buf, sizeof (buf)));
- printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf)));
+ printf ("src: %s, ", describe_image (src, buf));
+ printf ("dst: %s\n\n", describe_image (dst, buf));
}
success = FALSE;
diff --git a/test/fetch-test.c b/test/fetch-test.c
index 2ca16dd..314a072 100644
--- a/test/fetch-test.c
+++ b/test/fetch-test.c
@@ -8,7 +8,7 @@
static pixman_indexed_t mono_palette =
{
- .rgba = { 0x00000000, 0x00ffffff },
+ 0, { 0x00000000, 0x00ffffff },
};
@@ -24,57 +24,53 @@ typedef struct {
static testcase_t testcases[] =
{
{
- .format = PIXMAN_a8r8g8b8,
- .width = 2, .height = 2,
- .stride = 8,
- .src = { 0x00112233, 0x44556677,
- 0x8899aabb, 0xccddeeff },
- .dst = { 0x00112233, 0x44556677,
- 0x8899aabb, 0xccddeeff },
- .indexed = NULL,
+ PIXMAN_a8r8g8b8,
+ 2, 2,
+ 8,
+ { 0x00112233, 0x44556677,
+ 0x8899aabb, 0xccddeeff },
+ { 0x00112233, 0x44556677,
+ 0x8899aabb, 0xccddeeff },
+ NULL,
},
{
- .format = PIXMAN_g1,
- .width = 8, .height = 2,
- .stride = 4,
+ PIXMAN_g1,
+ 8, 2,
+ 4,
#ifdef WORDS_BIGENDIAN
- .src =
{
0xaa000000,
0x55000000
},
#else
- .src =
{
0x00000055,
0x000000aa
},
#endif
- .dst =
{
0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
},
- .indexed = &mono_palette,
+ &mono_palette,
},
#if 0
{
- .format = PIXMAN_g8,
- .width = 4, .height = 2,
- .stride = 4,
- .src = { 0x01234567,
- 0x89abcdef },
- .dst = { 0x00010101, 0x00232323, 0x00454545, 0x00676767,
- 0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
+ PIXMAN_g8,
+ 4, 2,
+ 4,
+ { 0x01234567,
+ 0x89abcdef },
+ { 0x00010101, 0x00232323, 0x00454545, 0x00676767,
+ 0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
},
#endif
/* FIXME: make this work on big endian */
{
- .format = PIXMAN_yv12,
- .width = 8, .height = 2,
- .stride = 8,
+ PIXMAN_yv12,
+ 8, 2,
+ 8,
#ifdef WORDS_BIGENDIAN
- .src =
{
0x00ff00ff, 0x00ff00ff,
0xff00ff00, 0xff00ff00,
@@ -82,7 +78,6 @@ static testcase_t testcases[] =
0x800080ff
},
#else
- .src =
{
0xff00ff00, 0xff00ff00,
0x00ff00ff, 0x00ff00ff,
@@ -90,7 +85,6 @@ static testcase_t testcases[] =
0xff800080
},
#endif
- .dst =
{
0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
diff --git a/test/trap-crasher.c b/test/trap-crasher.c
index 42b82f6..7485e62 100644
--- a/test/trap-crasher.c
+++ b/test/trap-crasher.c
@@ -7,21 +7,21 @@ main()
pixman_image_t *dst;
pixman_trapezoid_t traps[1] = {
{
- .top = 2147483646,
- .bottom = 2147483647,
- .left = {
- .p1 = { .x = 0, .y = 0 },
- .p2 = { .x = 0, .y = 2147483647 }
+ 2147483646,
+ 2147483647,
+ {
+ { 0, 0 },
+ { 0, 2147483647 }
},
- .right = {
- .p1 = { .x = 65536, .y = 0 },
- .p2 = { .x = 0, .y = 2147483647 }
+ {
+ { 65536, 0 },
+ { 0, 2147483647 }
}
},
};
-
+
dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
-
+
pixman_add_trapezoids (dst, 0, 0, sizeof (traps)/sizeof (traps[0]), traps);
return (0);
}
--
1.7.3.4

View File

@@ -0,0 +1,92 @@
From 72f5e5f608506c18c484bc5bc3e58bd83aeb7691 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Tue, 22 Feb 2011 22:04:49 +0100
Subject: [PATCH 04/22] test: Add Makefile for Win32
---
test/Makefile.win32 | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 73 insertions(+), 0 deletions(-)
create mode 100644 test/Makefile.win32
diff --git a/test/Makefile.win32 b/test/Makefile.win32
new file mode 100644
index 0000000..c71afe1
--- /dev/null
+++ b/test/Makefile.win32
@@ -0,0 +1,73 @@
+CC = cl
+LINK = link
+
+CFG_VAR = $(CFG)
+ifeq ($(CFG_VAR),)
+CFG_VAR=release
+endif
+
+CFLAGS = -MD -nologo -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -D_BIND_TO_CURRENT_VCLIBS_VERSION -D_MT -I../pixman -I. -I../
+TEST_LDADD = ../pixman/$(CFG_VAR)/pixman-1.lib
+INCLUDES = -I../pixman -I$(top_builddir)/pixman
+
+# optimization flags
+ifeq ($(CFG_VAR),debug)
+CFLAGS += -Od -Zi
+else
+CFLAGS += -O2
+endif
+
+SOURCES = \
+ a1-trap-test.c \
+ pdf-op-test.c \
+ region-test.c \
+ region-translate-test.c \
+ fetch-test.c \
+ oob-test.c \
+ trap-crasher.c \
+ alpha-loop.c \
+ scaling-crash-test.c \
+ gradient-crash-test.c \
+ alphamap.c \
+ stress-test.c \
+ composite-traps-test.c \
+ blitters-test.c \
+ scaling-test.c \
+ affine-test.c \
+ composite.c \
+ utils.c
+
+TESTS = \
+ $(CFG_VAR)/a1-trap-test.exe \
+ $(CFG_VAR)/pdf-op-test.exe \
+ $(CFG_VAR)/region-test.exe \
+ $(CFG_VAR)/region-translate-test.exe \
+ $(CFG_VAR)/fetch-test.exe \
+ $(CFG_VAR)/oob-test.exe \
+ $(CFG_VAR)/trap-crasher.exe \
+ $(CFG_VAR)/alpha-loop.exe \
+ $(CFG_VAR)/scaling-crash-test.exe \
+ $(CFG_VAR)/gradient-crash-test.exe \
+ $(CFG_VAR)/alphamap.exe \
+ $(CFG_VAR)/stress-test.exe \
+ $(CFG_VAR)/composite-traps-test.exe \
+ $(CFG_VAR)/blitters-test.exe \
+ $(CFG_VAR)/scaling-test.exe \
+ $(CFG_VAR)/affine-test.exe \
+ $(CFG_VAR)/composite.exe
+
+
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
+
+$(CFG_VAR)/%.obj: %.c
+ @mkdir -p $(CFG_VAR)
+ @$(CC) -c $(CFLAGS) -Fo"$@" $<
+
+$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj
+ $(LINK) /NOLOGO /OUT:$@ $< $(CFG_VAR)/utils.obj $(TEST_LDADD)
+
+all: $(OBJECTS) $(TESTS)
+ @exit 0
+
+clean:
+ @rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.pdb || exit 0
--
1.7.3.4

View File

@@ -0,0 +1,40 @@
From 8868778ea1fdc8e70da76b3b00ea78106c5840d8 Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Tue, 22 Feb 2011 22:43:48 +0100
Subject: [PATCH 05/22] Do not include unused headers
pixman-combine32.h is included without being used both in
pixman-image.c and in pixman-general.c.
---
pixman/pixman-general.c | 2 --
pixman/pixman-image.c | 1 -
2 files changed, 0 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 16ea3a4..872fb7e 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -36,8 +36,6 @@
#include <stdlib.h>
#include <string.h>
#include "pixman-private.h"
-#include "pixman-combine32.h"
-#include "pixman-private.h"
static void
general_src_iter_init (pixman_implementation_t *imp,
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 9103ca6..84bacf8 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -30,7 +30,6 @@
#include <assert.h>
#include "pixman-private.h"
-#include "pixman-combine32.h"
pixman_bool_t
_pixman_init_gradient (gradient_t * gradient,
--
1.7.3.4

View File

@@ -0,0 +1,63 @@
From 9ebde285fa990bfa1524f166fbfb1368c346b14a Mon Sep 17 00:00:00 2001
From: Andrea Canciani <ranma42@gmail.com>
Date: Thu, 24 Feb 2011 12:53:39 +0100
Subject: [PATCH 06/22] test: Silence MSVC warnings
MSVC does not notice non-returning functions (abort() / assert(0))
and warns about paths which end with them in non-void functions:
c:\cygwin\home\ranma42\code\fdo\pixman\test\fetch-test.c(114) :
warning C4715: 'reader' : not all control paths return a value
c:\cygwin\home\ranma42\code\fdo\pixman\test\stress-test.c(133) :
warning C4715: 'real_reader' : not all control paths return a value
c:\cygwin\home\ranma42\code\fdo\pixman\test\composite.c(431) :
warning C4715: 'calc_op' : not all control paths return a value
These warnings can be silenced by adding a return after the
termination call.
---
test/composite.c | 1 +
test/fetch-test.c | 1 +
test/stress-test.c | 2 +-
3 files changed, 3 insertions(+), 1 deletions(-)
diff --git a/test/composite.c b/test/composite.c
index 08c6689..a86e5ed 100644
--- a/test/composite.c
+++ b/test/composite.c
@@ -426,6 +426,7 @@ calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
case PIXMAN_OP_HSL_LUMINOSITY:
default:
abort();
+ return 0; /* silence MSVC */
}
#undef mult_chan
}
diff --git a/test/fetch-test.c b/test/fetch-test.c
index 314a072..60bc765 100644
--- a/test/fetch-test.c
+++ b/test/fetch-test.c
@@ -110,6 +110,7 @@ reader (const void *src, int size)
return *(uint32_t *)src;
default:
assert(0);
+ return 0; /* silence MSVC */
}
}
diff --git a/test/stress-test.c b/test/stress-test.c
index bcbc1f8..166dc6d 100644
--- a/test/stress-test.c
+++ b/test/stress-test.c
@@ -128,7 +128,7 @@ real_reader (const void *src, int size)
return *(uint32_t *)src;
default:
assert (0);
- break;
+ return 0; /* silence MSVC */
}
}
--
1.7.3.4

View File

@@ -0,0 +1,466 @@
From d506bf68fd0e9a1c5dd484daee70631699918387 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 21 Feb 2011 01:29:02 +0200
Subject: [PATCH 07/22] Main loop template for fast single pass bilinear scaling
Can be used for implementing SIMD optimized fast path
functions which work with bilinear scaled source images.
Similar to the template for nearest scaling main loop, the
following types of mask are supported:
1. no mask
2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag
3. solid mask
PAD repeat is fully supported. NONE repeat is partially
supported (right now only works if source image has alpha
channel or when alpha channel of the source image does not
have any effect on the compositing operation).
---
pixman/pixman-fast-path.h | 432 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 432 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h
index d081222..1885d47 100644
--- a/pixman/pixman-fast-path.h
+++ b/pixman/pixman-fast-path.h
@@ -587,4 +587,436 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp,
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling, depending on
+ * whether the 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it, or from both the image and the padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t source_image_width,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ int32_t * left_pad,
+ int32_t * left_tz,
+ int32_t * width,
+ int32_t * right_tz,
+ int32_t * right_pad)
+{
+ int width1 = *width, left_pad1, right_pad1;
+ int width2 = *width, left_pad2, right_pad2;
+
+ pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+ &width1, &left_pad1, &right_pad1);
+ pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+ unit_x, &width2, &left_pad2, &right_pad2);
+
+ *left_pad = left_pad2;
+ *left_tz = left_pad1 - left_pad2;
+ *right_tz = right_pad2 - right_pad1;
+ *right_pad = right_pad1;
+ *width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ * scanline_func (dst_type_t * dst,
+ * const mask_type_t * mask,
+ * const src_type_t * src_top,
+ * const src_type_t * src_bottom,
+ * int32_t width,
+ * int weight_top,
+ * int weight_bottom,
+ * pixman_fixed_t vx,
+ * pixman_fixed_t unit_x,
+ * pixman_fixed_t max_vx,
+ * pixman_bool_t zero_src)
+ *
+ * Where:
+ * dst - destination scanline buffer for storing results
+ * mask - mask buffer (or single value for solid mask)
+ * src_top, src_bottom - two source scanlines
+ * width - number of pixels to process
+ * weight_top - weight of the top row for interpolation
+ * weight_bottom - weight of the bottom row for interpolation
+ * vx - initial position for fetching the first pair of
+ * pixels from the source buffer
+ * unit_x - position increment needed to move to the next pair
+ * of pixels
+ * max_vx - image size as a fixed point value, can be used for
+ * implementing NORMAL repeat (when it is supported)
+ * zero_src - boolean hint variable, which is set to TRUE when
+ * all source pixels are fetched from zero padding
+ * zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
+ * but sometimes it may be less than that for NONE repeat when handling
+ * fuzzy antialiased top or bottom image edges. Also, both the top and
+ * bottom weight variables are guaranteed to have values in the 0-255
+ * range, so they fit into an unsigned byte and can be used with 8-bit SIMD
+ * multiplication instructions.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+static void \
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \
+ pixman_op_t op, \
+ pixman_image_t * src_image, \
+ pixman_image_t * mask_image, \
+ pixman_image_t * dst_image, \
+ int32_t src_x, \
+ int32_t src_y, \
+ int32_t mask_x, \
+ int32_t mask_y, \
+ int32_t dst_x, \
+ int32_t dst_y, \
+ int32_t width, \
+ int32_t height) \
+{ \
+ dst_type_t *dst_line; \
+ mask_type_t *mask_line; \
+ src_type_t *src_first_line; \
+ int y1, y2; \
+ pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \
+ pixman_vector_t v; \
+ pixman_fixed_t vx, vy; \
+ pixman_fixed_t unit_x, unit_y; \
+ int32_t left_pad, left_tz, right_tz, right_pad; \
+ \
+ dst_type_t *dst; \
+ mask_type_t solid_mask; \
+ const mask_type_t *mask = &solid_mask; \
+ int src_stride, mask_stride, dst_stride; \
+ \
+ PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1); \
+ if (have_mask) \
+ { \
+ if (mask_is_solid) \
+ { \
+ solid_mask = _pixman_image_get_solid (imp, mask_image, dst_image->bits.format); \
+ mask_stride = 0; \
+ } \
+ else \
+ { \
+ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t, \
+ mask_stride, mask_line, 1); \
+ } \
+ } \
+ /* pass in 0 instead of src_x and src_y because src_x and src_y need to be \
+ * transformed from destination space to source space */ \
+ PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1); \
+ \
+ /* reference point is the center of the pixel */ \
+ v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; \
+ v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; \
+ v.vector[2] = pixman_fixed_1; \
+ \
+ if (!pixman_transform_point_3d (src_image->common.transform, &v)) \
+ return; \
+ \
+ unit_x = src_image->common.transform->matrix[0][0]; \
+ unit_y = src_image->common.transform->matrix[1][1]; \
+ \
+ v.vector[0] -= pixman_fixed_1 / 2; \
+ v.vector[1] -= pixman_fixed_1 / 2; \
+ \
+ vy = v.vector[1]; \
+ \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD || \
+ PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x, \
+ &left_pad, &left_tz, &width, &right_tz, &right_pad); \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ /* PAD repeat does not need special handling for 'transition zones' and */ \
+ /* they can be combined with 'padding zones' safely */ \
+ left_pad += left_tz; \
+ right_pad += right_tz; \
+ left_tz = right_tz = 0; \
+ } \
+ v.vector[0] += left_pad * unit_x; \
+ } \
+ \
+ while (--height >= 0) \
+ { \
+ int weight1, weight2; \
+ dst = dst_line; \
+ dst_line += dst_stride; \
+ vx = v.vector[0]; \
+ if (have_mask && !mask_is_solid) \
+ { \
+ mask = mask_line; \
+ mask_line += mask_stride; \
+ } \
+ \
+ y1 = pixman_fixed_to_int (vy); \
+ weight2 = (vy >> 8) & 0xff; \
+ if (weight2) \
+ { \
+ /* normal case, both row weights are in 0-255 range and fit unsigned byte */ \
+ y2 = y1 + 1; \
+ weight1 = 256 - weight2; \
+ } \
+ else \
+ { \
+ /* set both top and bottom row to the same scanline, and weights to 128+128 */ \
+ y2 = y1; \
+ weight1 = weight2 = 128; \
+ } \
+ vy += unit_y; \
+ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD) \
+ { \
+ src_type_t *src1, *src2; \
+ src_type_t buf1[2]; \
+ src_type_t buf2[2]; \
+ repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height); \
+ repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \
+ src1 = src_first_line + src_stride * y1; \
+ src2 = src_first_line + src_stride * y2; \
+ \
+ if (left_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = src1[0]; \
+ buf2[0] = buf2[1] = src2[0]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \
+ dst += left_pad; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_pad; \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst, mask, \
+ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \
+ dst += width; \
+ if (have_mask && !mask_is_solid) \
+ mask += width; \
+ } \
+ if (right_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \
+ buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \
+ } \
+ } \
+ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \
+ { \
+ src_type_t *src1, *src2; \
+ src_type_t buf1[2]; \
+ src_type_t buf2[2]; \
+ /* handle top/bottom zero padding by just setting weights to 0 if needed */ \
+ if (y1 < 0) \
+ { \
+ weight1 = 0; \
+ y1 = 0; \
+ } \
+ if (y1 >= src_image->bits.height) \
+ { \
+ weight1 = 0; \
+ y1 = src_image->bits.height - 1; \
+ } \
+ if (y2 < 0) \
+ { \
+ weight2 = 0; \
+ y2 = 0; \
+ } \
+ if (y2 >= src_image->bits.height) \
+ { \
+ weight2 = 0; \
+ y2 = src_image->bits.height - 1; \
+ } \
+ src1 = src_first_line + src_stride * y1; \
+ src2 = src_first_line + src_stride * y2; \
+ \
+ if (left_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = 0; \
+ buf2[0] = buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \
+ dst += left_pad; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_pad; \
+ } \
+ if (left_tz > 0) \
+ { \
+ buf1[0] = 0; \
+ buf1[1] = src1[0]; \
+ buf2[0] = 0; \
+ buf2[1] = src2[0]; \
+ scanline_func (dst, mask, \
+ buf1, buf2, left_tz, weight1, weight2, \
+ pixman_fixed_frac (vx), unit_x, 0, FALSE); \
+ dst += left_tz; \
+ if (have_mask && !mask_is_solid) \
+ mask += left_tz; \
+ vx += left_tz * unit_x; \
+ } \
+ if (width > 0) \
+ { \
+ scanline_func (dst, mask, \
+ src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \
+ dst += width; \
+ if (have_mask && !mask_is_solid) \
+ mask += width; \
+ vx += width * unit_x; \
+ } \
+ if (right_tz > 0) \
+ { \
+ buf1[0] = src1[src_image->bits.width - 1]; \
+ buf1[1] = 0; \
+ buf2[0] = src2[src_image->bits.width - 1]; \
+ buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_tz, weight1, weight2, \
+ pixman_fixed_frac (vx), unit_x, 0, FALSE); \
+ dst += right_tz; \
+ if (have_mask && !mask_is_solid) \
+ mask += right_tz; \
+ } \
+ if (right_pad > 0) \
+ { \
+ buf1[0] = buf1[1] = 0; \
+ buf2[0] = buf2[1] = 0; \
+ scanline_func (dst, mask, \
+ buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \
+ } \
+ } \
+ else \
+ { \
+ scanline_func (dst, mask, src_first_line + src_stride * y1, \
+ src_first_line + src_stride * y2, width, \
+ weight1, weight2, vx, unit_x, max_vx, FALSE); \
+ } \
+ } \
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \
+ dst_type_t, repeat_mode, have_mask, mask_is_solid) \
+ FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+ dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define SCALED_BILINEAR_FLAGS \
+ (FAST_PATH_SCALE_TRANSFORM | \
+ FAST_PATH_NO_ALPHA_MAP | \
+ FAST_PATH_BILINEAR_FILTER | \
+ FAST_PATH_NO_ACCESSORS | \
+ FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_null, 0, \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_PAD_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ (SCALED_BILINEAR_FLAGS | \
+ FAST_PATH_NONE_REPEAT | \
+ FAST_PATH_X_UNIT_POSITIVE), \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op, \
+ }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func) \
+ { PIXMAN_OP_ ## op, \
+ PIXMAN_ ## s, \
+ SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP, \
+ PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA), \
+ PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS, \
+ fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op, \
+ }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func) \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func), \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func), \
+ SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
#endif
--
1.7.3.4

View File

@@ -0,0 +1,136 @@
From 0df43b8ae5031dd83775d00b57b6bed809db0e89 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 21 Feb 2011 02:07:09 +0200
Subject: [PATCH 08/22] test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds'
Individual correctness check for the new bilinear scaling related
supplementary function. This test program uses a bit wider range
of input arguments, not covered by other tests.
---
test/Makefile.am | 2 +
test/scaling-helpers-test.c | 93 +++++++++++++++++++++++++++++++++++++++++++
2 files changed, 95 insertions(+), 0 deletions(-)
create mode 100644 test/scaling-helpers-test.c
diff --git a/test/Makefile.am b/test/Makefile.am
index 057e9ce..9dc7219 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -13,6 +13,7 @@ TESTPROGRAMS = \
trap-crasher \
alpha-loop \
scaling-crash-test \
+ scaling-helpers-test \
gradient-crash-test \
alphamap \
stress-test \
@@ -33,6 +34,7 @@ alpha_loop_SOURCES = alpha-loop.c utils.c utils.h
composite_SOURCES = composite.c utils.c utils.h
gradient_crash_test_SOURCES = gradient-crash-test.c utils.c utils.h
stress_test_SOURCES = stress-test.c utils.c utils.h
+scaling_helpers_test_SOURCES = scaling-helpers-test.c utils.c utils.h
# Benchmarks
diff --git a/test/scaling-helpers-test.c b/test/scaling-helpers-test.c
new file mode 100644
index 0000000..c186138
--- /dev/null
+++ b/test/scaling-helpers-test.c
@@ -0,0 +1,93 @@
+#include <config.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-fast-path.h"
+
+/* Trivial reference for 'bilinear_pad_repeat_get_scanline_bounds': steps over
+ * all '*width' pixels one at a time and counts how many fall into each zone.
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t        source_image_width,
+                                             pixman_fixed_t vx_,
+                                             pixman_fixed_t unit_x,
+                                             int32_t *      left_pad,
+                                             int32_t *      left_tz,
+                                             int32_t *      width,
+                                             int32_t *      right_tz,
+                                             int32_t *      right_pad)
+{
+    int w = *width;   /* in: total pixel count; '*width' is reused as an output below */
+    int64_t vx = vx_; /* declared before any statement (C89); 64-bit so 'vx += unit_x' cannot overflow */
+    *left_pad = 0;
+    *left_tz = 0;
+    *width = 0;
+    *right_tz = 0;
+    *right_pad = 0;
+    while (--w >= 0)
+    {
+        if (vx < 0)
+        {
+            if (vx + pixman_fixed_1 < 0)
+                *left_pad += 1; /* vx and vx + 1 are both negative */
+            else
+                *left_tz += 1; /* only vx itself is negative */
+        }
+        else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+        {
+            if (vx >= pixman_int_to_fixed (source_image_width))
+                *right_pad += 1; /* vx and vx + 1 are both at or past the image width */
+            else
+                *right_tz += 1; /* only vx + 1 reaches the image width */
+        }
+        else
+        {
+            *width += 1; /* vx and vx + 1 both lie inside [0, source_image_width) */
+        }
+        vx += unit_x; /* advance to the next destination pixel */
+    }
+}
+
+int
+main (void)
+{ /* Checks the real function against the reference on 10000 lcg_rand_N-generated argument sets */
+ int i;
+ for (i = 0; i < 10000; i++)
+ {
+ int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1; /* results of the reference */
+ int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2; /* results of the function under test */
+ pixman_fixed_t vx = lcg_rand_N(10000 << 16) - (3000 << 16); /* start position, may be negative (presumably lcg_rand_N(N) is in [0, N) - see utils.h) */
+ int32_t width = lcg_rand_N(10000);
+ int32_t source_image_width = lcg_rand_N(10000) + 1; /* + 1: never zero, given a non-negative lcg_rand_N result */
+ pixman_fixed_t unit_x = lcg_rand_N(10 << 16) + 1; /* + 1: step is never zero, given a non-negative lcg_rand_N result */
+ width1 = width2 = width; /* 'width' is an in/out argument, so each call gets its own copy */
+
+ bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+ vx,
+ unit_x,
+ &left_pad1,
+ &left_tz1,
+ &width1,
+ &right_tz1,
+ &right_pad1);
+
+ bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+ vx,
+ unit_x,
+ &left_pad2,
+ &left_tz2,
+ &width2,
+ &right_tz2,
+ &right_pad2);
+
+ assert (left_pad1 == left_pad2); /* all five outputs must agree exactly */
+ assert (left_tz1 == left_tz2);
+ assert (width1 == width2);
+ assert (right_tz1 == right_tz2);
+ assert (right_pad1 == right_pad2);
+ }
+
+ return 0;
+}
--
1.7.3.4

View File

@@ -0,0 +1,156 @@
From 350029396d911941591149cc82b5e68a78ad6747 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 21 Feb 2011 20:18:02 +0200
Subject: [PATCH 09/22] SSE2 optimization for bilinear scaled 'src_8888_8888'
A primitive naive implementation of bilinear scaling using SSE2 intrinsics,
which only handles one pixel at a time. It is approximately 2x faster than
pixman general compositing path. Single pass processing without intermediate
temporary buffer contributes to ~15% and loop unrolling contributes to ~20%
of this speedup.
Benchmark on Intel Core i7 (x86-64):
Using cairo-perf-trace:
before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6
after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s
---
pixman/pixman-sse2.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 112 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 88287b4..696005f 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+static void
+bilinear_interpolate_line_sse2 (uint32_t * out, /* destination scanline */
+ const uint32_t * top, /* upper source row */
+ const uint32_t * bottom, /* lower source row */
+ int wt, /* weight multiplied into 'top' pixels */
+ int wb, /* weight multiplied into 'bottom' pixels (presumably wt + wb == 256 so the 16-bit sums cannot overflow - confirm in caller) */
+ pixman_fixed_t x, /* fixed-point source position of the first output pixel */
+ pixman_fixed_t ux, /* added to x after every output pixel */
+ int width) /* number of output pixels */
+{ /* Writes 'width' pixels to 'out', each interpolated from a 2x2 block of 'top'/'bottom' */
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); /* with xmm_addc: low 4 lanes become 256 - v, high 4 lanes stay v */
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+ const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+ const __m128i xmm_zero = _mm_setzero_si128 ();
+ __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); /* each 16-bit lane keeps only the low 16 bits of x */
+ uint32_t pix1, pix2, pix3, pix4;
+
+ #define INTERPOLATE_ONE_PIXEL(pix) \
+ do { \
+ __m128i xmm_wh, xmm_lo, xmm_hi, a; \
+ /* fetch 2x2 pixel block into sse2 register */ \
+ uint32_t tl = top [pixman_fixed_to_int (x)]; \
+ uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \
+ uint32_t bl = bottom [pixman_fixed_to_int (x)]; \
+ uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \
+ a = _mm_set_epi32 (tr, tl, br, bl); /* high 64 bits: top pair, low 64 bits: bottom pair */ \
+ x += ux; \
+ /* vertical interpolation */ \
+ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \
+ xmm_wt), \
+ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \
+ xmm_wb)); /* lanes 0-3: left column, lanes 4-7: right column */ \
+ /* calculate horizontal weights */ \
+ xmm_wh = _mm_add_epi16 (xmm_addc, \
+ _mm_xor_si128 (xmm_xorc, \
+ _mm_srli_epi16 (xmm_x, 8))); /* v = upper 8 bits of each lane of xmm_x */ \
+ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
+ /* horizontal interpolation */ \
+ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); /* low and high halves of the unsigned 16x16 -> 32 bit products */ \
+ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
+ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
+ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); /* weighted left column + weighted right column */ \
+ /* shift and pack the result */ \
+ a = _mm_srli_epi32 (a, 16); \
+ a = _mm_packs_epi32 (a, a); \
+ a = _mm_packus_epi16 (a, a); \
+ pix = _mm_cvtsi128_si32 (a); \
+ } while (0)
+
+ while ((width -= 4) >= 0) /* four pixels per iteration; leaves width in [-4, -1] */
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ INTERPOLATE_ONE_PIXEL (pix3);
+ INTERPOLATE_ONE_PIXEL (pix4);
+ *out++ = pix1;
+ *out++ = pix2;
+ *out++ = pix3;
+ *out++ = pix4;
+ }
+ if (width & 2) /* width is negative here; in two's complement its low two bits still equal the remaining pixel count */
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ INTERPOLATE_ONE_PIXEL (pix2);
+ *out++ = pix1;
+ *out++ = pix2;
+ }
+ if (width & 1)
+ {
+ INTERPOLATE_ONE_PIXEL (pix1);
+ *out = pix1;
+ }
+
+ #undef INTERPOLATE_ONE_PIXEL
+}
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask, /* unused */
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx, /* unused */
+ pixman_bool_t zero_src) /* unused */
+{ /* Adapter handed to FAST_BILINEAR_MAINLOOP_COMMON below; forwards to bilinear_interpolate_line_sse2 */
+ bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+ wt, wb, vx, unit_x, w); /* note: the pixel count 'w' is passed last */
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+ scaled_bilinear_scanline_sse2_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
static const pixman_fast_path_t sse2_fast_paths[] =
{
/* PIXMAN_OP_OVER */
@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,288 @@
From 17feaa9c50bb8521b0366345efe181bd99754957 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Tue, 22 Feb 2011 18:45:03 +0200
Subject: [PATCH 10/22] ARM: NEON optimization for bilinear scaled 'src_8888_8888'
Initial NEON optimization for bilinear scaling. Can be probably
improved more.
Benchmark on ARM Cortex-A8:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s
---
pixman/pixman-arm-neon-asm.S | 197 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 45 ++++++++++
2 files changed, 242 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 47daf45..c168e10 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2391,3 +2391,200 @@ generate_composite_function_nearest_scanline \
10, /* dst_r_basereg */ \
8, /* src_basereg */ \
15 /* mask_basereg */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+ .func fname /* matched by the '.endfunc' that closes each function */
+ .global fname /* export the symbol */
+#ifdef __ELF__
+ .hidden fname /* ELF only: hidden visibility */
+ .type fname, %function /* ELF only: mark the symbol as a function */
+#endif
+fname: /* entry label; the function body follows the macro invocation */
+.endm
+
+.macro bilinear_interpolate_last_pixel /* one pixel; X and q12 are not advanced afterwards */
+ mov TMP1, X, asr #16 /* TMP1 = X >> 16 (integer pixel index) */
+ mov TMP2, X, asr #16
+ add TMP1, TOP, TMP1, asl #2 /* address in the top row (index * 4 bytes) */
+ add TMP2, BOTTOM, TMP2, asl #2 /* same index in the bottom row */
+ vld1.32 {d0}, [TMP1] /* d0 = two adjacent top pixels (left, right) */
+ vshr.u16 d30, d24, #8 /* d30 = upper 8 bits of each 16-bit lane of d24 (horizontal weight) */
+ vld1.32 {d1}, [TMP2] /* d1 = two adjacent bottom pixels */
+ vmull.u8 q1, d0, d28 /* q1 = top * d28 (d28/d29 are loaded from WT/WB by the caller) */
+ vmlal.u8 q1, d1, d29 /* q1 += bottom * d29; d2 = left column, d3 = right column */
+ /* 5 cycles bubble */
+ vshll.u16 q0, d2, #8 /* q0 = left << 8 */
+ vmlsl.u16 q0, d2, d30 /* q0 -= left * weight */
+ vmlal.u16 q0, d3, d30 /* q0 += right * weight */
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #16 /* >> 16, narrow to 16 bits per channel */
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0 /* narrow to 8 bits per channel */
+ /* 1 cycle bubble */
+ vst1.32 {d0[0]}, [OUT, :32]! /* store one 32-bit pixel and advance OUT */
+.endm
+
+.macro bilinear_interpolate_two_pixels /* two pixels; advances X twice and q12 by q13 */
+ mov TMP1, X, asr #16 /* first pixel: integer index of X */
+ mov TMP2, X, asr #16
+ add X, X, UX /* step X to the next pixel */
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d0}, [TMP1] /* top left/right pair */
+ vld1.32 {d1}, [TMP2] /* bottom left/right pair */
+ vmull.u8 q1, d0, d28 /* vertical blend of pixel 1 into q1 (d2 = left, d3 = right) */
+ vmlal.u8 q1, d1, d29
+ mov TMP1, X, asr #16 /* second pixel */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28 /* vertical blend of pixel 2 into q11 (d22 = left, d23 = right) */
+ vmlal.u8 q11, d21, d29
+ vshr.u16 q15, q12, #8 /* d30 / d31 = horizontal weights for pixel 1 / pixel 2 */
+ vadd.u16 q12, q12, q13 /* advance the per-lane X values by q13 (set up by the caller) */
+ vshll.u16 q0, d2, #8 /* pixel 1: left << 8 - left * w + right * w */
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8 /* pixel 2: same with weights in d31 */
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d30, q0, #16 /* narrow both results to 16 bits per channel (q15 is reused) */
+ vshrn.u32 d31, q10, #16
+ vmovn.u16 d0, q15 /* narrow to 8 bits per channel: d0 = two pixels */
+ vst1.32 {d0}, [OUT]! /* store two pixels and advance OUT */
+.endm
+
+.macro bilinear_interpolate_four_pixels /* four pixels; advances X four times and q12 twice by q13 */
+ mov TMP1, X, asr #16 /* pixel 1: integer index of X */
+ mov TMP2, X, asr #16
+ add X, X, UX /* step X to the next pixel */
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d0}, [TMP1] /* top left/right pair */
+ vld1.32 {d1}, [TMP2] /* bottom left/right pair */
+ vmull.u8 q1, d0, d28 /* vertical blend of pixel 1 into q1 (d2, d3) */
+ vmlal.u8 q1, d1, d29
+ mov TMP1, X, asr #16 /* pixel 2 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d20}, [TMP1]
+ vld1.32 {d21}, [TMP2]
+ vmull.u8 q11, d20, d28 /* vertical blend of pixel 2 into q11 (d22, d23) */
+ vmlal.u8 q11, d21, d29
+ vshr.u16 q15, q12, #8 /* d30 / d31 = horizontal weights for pixels 1 / 2 */
+ vadd.u16 q12, q12, q13 /* advance the per-lane X values to pixels 3 / 4 */
+ vshll.u16 q0, d2, #8 /* pixel 1 horizontal blend into q0 */
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8 /* pixel 2 horizontal blend into q10 */
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ mov TMP1, X, asr #16 /* pixel 3 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d4}, [TMP1]
+ vld1.32 {d5}, [TMP2]
+ vmull.u8 q3, d4, d28 /* vertical blend of pixel 3 into q3 (d6, d7) */
+ vmlal.u8 q3, d5, d29
+ mov TMP1, X, asr #16 /* pixel 4 */
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {d16}, [TMP1]
+ vld1.32 {d17}, [TMP2]
+ vmull.u8 q9, d16, d28 /* vertical blend of pixel 4 into q9 (d18, d19) */
+ vmlal.u8 q9, d17, d29
+ vshr.u16 q15, q12, #8 /* d30 / d31 = horizontal weights for pixels 3 / 4 */
+ vadd.u16 q12, q12, q13 /* advance the per-lane X values for the next invocation */
+ vshll.u16 q2, d6, #8 /* pixel 3 horizontal blend into q2 */
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #8 /* pixel 4 horizontal blend into q8 */
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vshrn.u32 d0, q0, #16 /* narrow all four results to 16 bits per channel */
+ vshrn.u32 d1, q10, #16
+ vshrn.u32 d4, q2, #16
+ vshrn.u32 d5, q8, #16
+ vmovn.u16 d0, q0 /* d0 = pixels 1-2 at 8 bits per channel */
+ vmovn.u16 d1, q2 /* d1 = pixels 3-4 at 8 bits per channel */
+ vst1.32 {d0, d1}, [OUT]! /* store four pixels and advance OUT */
+.endm
+
+
+/*
+ * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
+ * const uint32_t * top,
+ * const uint32_t * bottom,
+ * int wt,
+ * int wb,
+ * pixman_fixed_t x,
+ * pixman_fixed_t ux,
+ * int width)
+ */
+
+pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
+ OUT .req r0 /* first four arguments are used directly from r0-r3 */
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4 /* wb, x, ux and width are loaded from the stack below */
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3 /* aliases WT: usable as scratch only after WT was copied to d28 */
+ TMP2 .req r4 /* aliases WB: usable as scratch only after WB was copied to d29 */
+
+ mov ip, sp /* remember where the stack arguments are before pushing */
+ push {r4, r5, r6, r7} /* save registers (r7 is saved but not referenced in this function) */
+ ldmia ip, {WB, X, UX, WIDTH} /* load wb, x, ux, width; overwrites ip with width */
+
+ cmp WIDTH, #0
+ ble 3f /* nothing to do for width <= 0 */
+ vdup.u16 q12, X /* every 16-bit lane = low 16 bits of X */
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT /* d28 = WT in every byte */
+ vdup.u8 d29, WB /* d29 = WB in every byte */
+ vadd.u16 d25, d25, d26 /* upper half of q12 = X + UX (second pixel of a pair) */
+ vadd.u16 q13, q13, q13 /* q13 = 2 * UX: step for a pair of pixels */
+
+ subs WIDTH, WIDTH, #4
+ blt 1f /* fewer than four pixels: skip the main loop */
+0:
+ bilinear_interpolate_four_pixels
+ subs WIDTH, WIDTH, #4
+ bge 0b
+1:
+ tst WIDTH, #2 /* WIDTH is negative here, but its low two bits still give the remaining 0-3 pixels */
+ beq 2f
+ bilinear_interpolate_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel
+3:
+ pop {r4, r5, r6, r7} /* restore the registers pushed on entry */
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq BOTTOM
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+.endfunc
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3e0c0d1..c7c0254 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -232,6 +232,47 @@ pixman_blt_neon (uint32_t *src_bits,
}
}
+void
+pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
+ const uint32_t * top,
+ const uint32_t * bottom,
+ int wt,
+ int wb,
+ pixman_fixed_t x,
+ pixman_fixed_t ux,
+ int width);
+
+static force_inline void
+scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
+ const uint32_t * mask, /* unused */
+ const uint32_t * src_top,
+ const uint32_t * src_bottom,
+ int32_t w,
+ int wt,
+ int wb,
+ pixman_fixed_t vx,
+ pixman_fixed_t unit_x,
+ pixman_fixed_t max_vx, /* unused */
+ pixman_bool_t zero_src) /* unused */
+{ /* Adapter handed to FAST_BILINEAR_MAINLOOP_COMMON below; forwards to the NEON assembly routine */
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
+ src_bottom, wt, wb,
+ vx, unit_x, w); /* note: the pixel count 'w' is passed last */
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
+ scaled_bilinear_scanline_neon_8888_8888_SRC,
+ uint32_t, uint32_t, uint32_t,
+ NONE, FALSE, FALSE)
+
static const pixman_fast_path_t arm_neon_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
@@ -343,6 +384,10 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,156 @@
From 84f3c5a71a2de1a96dcf0c7f9ab0a8ee1b1b158f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com>
Date: Mon, 7 Mar 2011 13:45:54 -0500
Subject: [PATCH 11/22] test: In image_endian_swap() use pixman_image_get_format() to get the bpp.
There is no reason to pass in the bpp as an argument; it can be gotten
directly from the image.
---
test/affine-test.c | 6 +++---
test/blitters-test.c | 4 ++--
test/composite-traps-test.c | 2 +-
test/scaling-test.c | 6 +++---
test/utils.c | 9 +++++++--
test/utils.h | 2 +-
6 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/test/affine-test.c b/test/affine-test.c
index b7a1fa6..ed8000c 100644
--- a/test/affine-test.c
+++ b/test/affine-test.c
@@ -95,8 +95,8 @@ test_composite (int testnum,
dst_img = pixman_image_create_bits (
dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
- image_endian_swap (src_img, src_bpp * 8);
- image_endian_swap (dst_img, dst_bpp * 8);
+ image_endian_swap (src_img);
+ image_endian_swap (dst_img);
pixman_transform_init_identity (&transform);
@@ -251,7 +251,7 @@ test_composite (int testnum,
dstbuf[i] &= 0xFFFFFF;
}
- image_endian_swap (dst_img, dst_bpp * 8);
+ image_endian_swap (dst_img);
if (verbose)
{
diff --git a/test/blitters-test.c b/test/blitters-test.c
index 42181ef..63e7cb3 100644
--- a/test/blitters-test.c
+++ b/test/blitters-test.c
@@ -61,7 +61,7 @@ create_random_image (pixman_format_code_t *allowed_formats,
pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
}
- image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+ image_endian_swap (img);
if (used_fmt) *used_fmt = fmt;
return img;
@@ -101,7 +101,7 @@ free_random_image (uint32_t initcrc,
/* swap endiannes in order to provide identical results on both big
* and litte endian systems
*/
- image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+ image_endian_swap (img);
crc32 = compute_crc32 (initcrc, data, stride * height);
}
diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c
index 8f32778..298537d 100644
--- a/test/composite-traps-test.c
+++ b/test/composite-traps-test.c
@@ -218,7 +218,7 @@ test_composite (int testnum,
dst_bits[i] &= 0xFFFFFF;
}
- image_endian_swap (dst_img, dst_bpp * 8);
+ image_endian_swap (dst_img);
if (verbose)
{
diff --git a/test/scaling-test.c b/test/scaling-test.c
index dbb9d39..82370f7 100644
--- a/test/scaling-test.c
+++ b/test/scaling-test.c
@@ -140,8 +140,8 @@ test_composite (int testnum,
dst_img = pixman_image_create_bits (
dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
- image_endian_swap (src_img, src_bpp * 8);
- image_endian_swap (dst_img, dst_bpp * 8);
+ image_endian_swap (src_img);
+ image_endian_swap (dst_img);
if (lcg_rand_n (4) > 0)
{
@@ -330,7 +330,7 @@ test_composite (int testnum,
dstbuf[i] &= 0xFFFFFF;
}
- image_endian_swap (dst_img, dst_bpp * 8);
+ image_endian_swap (dst_img);
if (verbose)
{
diff --git a/test/utils.c b/test/utils.c
index 2f21398..4bf02e1 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -133,11 +133,12 @@ compute_crc32 (uint32_t in_crc32,
/* perform endian conversion of pixel data
*/
void
-image_endian_swap (pixman_image_t *img, int bpp)
+image_endian_swap (pixman_image_t *img)
{
int stride = pixman_image_get_stride (img);
uint32_t *data = pixman_image_get_data (img);
int height = pixman_image_get_height (img);
+ int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
int i, j;
/* swap bytes only on big endian systems */
@@ -145,10 +146,13 @@ image_endian_swap (pixman_image_t *img, int bpp)
if (*(volatile uint8_t *)&endian_check_var != 0x12)
return;
+ if (bpp == 8)
+ return;
+
for (i = 0; i < height; i++)
{
uint8_t *line_data = (uint8_t *)data + stride * i;
- /* swap bytes only for 16, 24 and 32 bpp for now */
+
switch (bpp)
{
case 1:
@@ -208,6 +212,7 @@ image_endian_swap (pixman_image_t *img, int bpp)
}
break;
default:
+ assert (FALSE);
break;
}
}
diff --git a/test/utils.h b/test/utils.h
index 9c7bdb1..a5183f7 100644
--- a/test/utils.h
+++ b/test/utils.h
@@ -60,7 +60,7 @@ compute_crc32 (uint32_t in_crc32,
/* perform endian conversion of pixel data
*/
void
-image_endian_swap (pixman_image_t *img, int bpp);
+image_endian_swap (pixman_image_t *img);
/* Allocate memory that is bounded by protected pages,
* so that out-of-bounds access will cause segfaults
--
1.7.3.4

View File

@@ -0,0 +1,36 @@
From 84e361c8e357e26f299213fbeefe64c73447b116 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B8ren=20Sandmann=20Pedersen?= <ssp@redhat.com>
Date: Fri, 4 Mar 2011 15:51:18 -0500
Subject: [PATCH 12/22] test: Do endian swapping of the source and destination images.
Otherwise the test fails on big endian. Fix for bug 34767, reported by
Siarhei Siamashka.
---
test/composite-traps-test.c | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)
diff --git a/test/composite-traps-test.c b/test/composite-traps-test.c
index 298537d..cf30281 100644
--- a/test/composite-traps-test.c
+++ b/test/composite-traps-test.c
@@ -139,6 +139,8 @@ test_composite (int testnum,
pixman_image_set_source_clipping (src_img, 1);
pixman_region_fini (&clip);
}
+
+ image_endian_swap (src_img);
}
/* Create destination image */
@@ -157,6 +159,8 @@ test_composite (int testnum,
dst_img = pixman_image_create_bits (
dst_format, dst_width, dst_height, dst_bits, dst_stride);
+
+ image_endian_swap (dst_img);
}
/* Create traps */
--
1.7.3.4

View File

@@ -0,0 +1,77 @@
From bb3d1b67fd0f42ae00af811c624ea1c44541034d Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 6 Mar 2011 16:17:12 +0200
Subject: [PATCH 13/22] ARM: use prefetch in nearest scaled 'src_0565_0565'
Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=10020565, dst=10020565, speed=75.02 MPix/s
after: op=1, src=10020565, dst=10020565, speed=73.63 MPix/s
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=10020565, dst=10020565, speed=176.12 MPix/s
after: op=1, src=10020565, dst=10020565, speed=267.50 MPix/s
---
pixman/pixman-arm-simd-asm.S | 27 +++++++++++++++++++++++++--
1 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 7567700..dd1366d 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -348,6 +348,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
TMP1 .req r4
TMP2 .req r5
VXMASK .req r6
+ PF_OFFS .req r7
ldr UNIT_X, [sp]
push {r4, r5, r6, r7}
@@ -366,12 +367,33 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
strh TMP2, [DST], #2
.endm
+ /*
+ * stop prefetch before reaching the end of scanline (a good behaving
+ * value selected based on some benchmarks with short scanlines)
+ */
+ #define PREFETCH_BRAKING_DISTANCE 32
+
/* now do the scaling */
and TMP1, VXMASK, VX, lsr #15
add VX, VX, UNIT_X
- subs W, #4
+ subs W, #(8 + PREFETCH_BRAKING_DISTANCE)
+ blt 2f
+ /* set prefetch distance to 80 pixels ahead */
+ add PF_OFFS, VX, UNIT_X, lsl #6
+ add PF_OFFS, PF_OFFS, UNIT_X, lsl #4
+1: /* main loop, process 8 pixels per iteration with prefetch */
+ subs W, W, #8
+ add PF_OFFS, UNIT_X, lsl #3
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ scale_2_pixels
+ pld [SRC, PF_OFFS, lsr #15]
+ bge 1b
+2:
+ subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE)
blt 2f
-1: /* main loop, process 4 pixels per iteration */
+1: /* process the remaining pixels */
scale_2_pixels
scale_2_pixels
subs W, W, #4
@@ -394,6 +416,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
.unreq TMP1
.unreq TMP2
.unreq VXMASK
+ .unreq PF_OFFS
/* return */
pop {r4, r5, r6, r7}
bx lr
--
1.7.3.4

View File

@@ -0,0 +1,131 @@
From f3e17872f5522e25da8e32de83e62bee8cc198d7 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 7 Mar 2011 03:10:43 +0200
Subject: [PATCH 14/22] ARM: common macro for nearest scaling fast paths
The code of nearest scaled 'src_0565_0565' function was generalized
and moved to a common macro, so that it can be reused for other
fast paths.
---
pixman/pixman-arm-simd-asm.S | 60 +++++++++++++++++++++++++----------------
1 files changed, 36 insertions(+), 24 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index dd1366d..a9775e2 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -331,15 +331,29 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
.endfunc
/*
- * Note: This function is only using armv4t instructions (not even armv6),
+ * Note: This code is only using armv5te instructions (not even armv6),
* but is scheduled for ARM Cortex-A8 pipeline. So it might need to
* be split into a few variants, tuned for each microarchitecture.
*
* TODO: In order to get good performance on ARM9/ARM11 cores (which don't
* have efficient write combining), it needs to be changed to use 16-byte
* aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * bpp_shift - (1 << bpp_shift) is the size of pixel in bytes
+ * t - type suffix for LDR/STR instructions
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ * prefetch_braking_distance - stop prefetching when that many pixels are
+ * remaining before the end of scanline
*/
-pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t, \
+ prefetch_distance, \
+ prefetch_braking_distance
+
+pixman_asm_function fname
W .req r0
DST .req r1
SRC .req r2
@@ -352,35 +366,29 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
ldr UNIT_X, [sp]
push {r4, r5, r6, r7}
- mvn VXMASK, #1
+ mvn VXMASK, #((1 << bpp_shift) - 1)
/* define helper macro */
.macro scale_2_pixels
- ldrh TMP1, [SRC, TMP1]
- and TMP2, VXMASK, VX, lsr #15
+ ldr&t TMP1, [SRC, TMP1]
+ and TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP1, [DST], #2
+ str&t TMP1, [DST], #(1 << bpp_shift)
- ldrh TMP2, [SRC, TMP2]
- and TMP1, VXMASK, VX, lsr #15
+ ldr&t TMP2, [SRC, TMP2]
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- strh TMP2, [DST], #2
+ str&t TMP2, [DST], #(1 << bpp_shift)
.endm
- /*
- * stop prefetch before reaching the end of scanline (a good behaving
- * value selected based on some benchmarks with short scanlines)
- */
- #define PREFETCH_BRAKING_DISTANCE 32
-
/* now do the scaling */
- and TMP1, VXMASK, VX, lsr #15
+ and TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
add VX, VX, UNIT_X
- subs W, #(8 + PREFETCH_BRAKING_DISTANCE)
+ subs W, W, #(8 + prefetch_braking_distance)
blt 2f
- /* set prefetch distance to 80 pixels ahead */
- add PF_OFFS, VX, UNIT_X, lsl #6
- add PF_OFFS, PF_OFFS, UNIT_X, lsl #4
+ /* calculate prefetch offset */
+ mov PF_OFFS, #prefetch_distance
+ mla PF_OFFS, UNIT_X, PF_OFFS, VX
1: /* main loop, process 8 pixels per iteration with prefetch */
subs W, W, #8
add PF_OFFS, UNIT_X, lsl #3
@@ -388,10 +396,10 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
scale_2_pixels
scale_2_pixels
scale_2_pixels
- pld [SRC, PF_OFFS, lsr #15]
+ pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
bge 1b
2:
- subs W, #(4 - 8 - PREFETCH_BRAKING_DISTANCE)
+ subs W, W, #(4 - 8 - prefetch_braking_distance)
blt 2f
1: /* process the remaining pixels */
scale_2_pixels
@@ -404,8 +412,8 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
scale_2_pixels
2:
tst W, #1
- ldrneh TMP1, [SRC, TMP1]
- strneh TMP1, [DST], #2
+ ldrne&t TMP1, [SRC, TMP1]
+ strne&t TMP1, [DST]
/* cleanup helper macro */
.purgem scale_2_pixels
.unreq DST
@@ -421,3 +429,7 @@ pixman_asm_function pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6
pop {r4, r5, r6, r7}
bx lr
.endfunc
+.endm
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
--
1.7.3.4

View File

@@ -0,0 +1,60 @@
From 5921c17639fe5fdc595c850e3347281c1c8746ba Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Sun, 6 Mar 2011 22:16:32 +0200
Subject: [PATCH 15/22] ARM: assembly optimized nearest scaled 'src_8888_8888'
Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=44.36 MPix/s
after: op=1, src=20028888, dst=20028888, speed=39.79 MPix/s
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=102.36 MPix/s
after: op=1, src=20028888, dst=20028888, speed=163.12 MPix/s
---
pixman/pixman-arm-simd-asm.S | 3 +++
pixman/pixman-arm-simd.c | 9 +++++++++
2 files changed, 12 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index a9775e2..858c690 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -433,3 +433,6 @@ pixman_asm_function fname
generate_nearest_scanline_func \
pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+ pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index 6bbc109..a66f8df 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -389,6 +389,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+ uint32_t, uint32_t)
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
@@ -411,6 +413,13 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+ PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,130 @@
From 66f4ee1b3bccf4516433d61dbf2035551a712fa2 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 10:59:46 +0200
Subject: [PATCH 16/22] ARM: new bilinear fast path template macro in 'pixman-arm-common.h'
It can be reused in different ARM NEON bilinear scaling fast path functions.
---
pixman/pixman-arm-common.h | 45 ++++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 44 ++----------------------------------------
2 files changed, 48 insertions(+), 41 deletions(-)
diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index 9b1322b..c3bf986 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -361,4 +361,49 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \
SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void /* prototype of the assembly routine, defined in a separate .S file */   \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                               dst_type *       dst,          \
+                                               const src_type * top,          \
+                                               const src_type * bottom,       \
+                                               int              wt,           \
+                                               int              wb,           \
+                                               pixman_fixed_t   x,            \
+                                               pixman_fixed_t   ux,           \
+                                               int              width);       \
+/* C adapter with the argument list given to FAST_BILINEAR_MAINLOOP_COMMON */ \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                 dst_type *       dst,                                        \
+                 const uint32_t * mask,      /* unused */                     \
+                 const src_type * src_top,                                    \
+                 const src_type * src_bottom,                                 \
+                 int32_t          w,                                          \
+                 int              wt,                                         \
+                 int              wb,                                         \
+                 pixman_fixed_t   vx,                                         \
+                 pixman_fixed_t   unit_x,                                     \
+                 pixman_fixed_t   max_vx,    /* unused */                     \
+                 pixman_bool_t    zero_src)                                   \
+{                                                                             \
+    if (((flags) & SKIP_ZERO_SRC) && zero_src) /* (flags): safe for A | B */  \
+        return; /* caller asked to skip scanlines whose source is zero */     \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+/* Instantiate the main loops for the cover, none and pad variants */        \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FALSE, FALSE)     \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FALSE, FALSE)      \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FALSE, FALSE)
+
#endif
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index c7c0254..98ad5f2 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -127,6 +127,9 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
OVER, uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+ uint32_t, uint32_t)
+
void
pixman_composite_src_n_8_asm_neon (int32_t w,
int32_t h,
@@ -232,47 +235,6 @@ pixman_blt_neon (uint32_t *src_bits,
}
}
-void
-pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (uint32_t * out,
- const uint32_t * top,
- const uint32_t * bottom,
- int wt,
- int wb,
- pixman_fixed_t x,
- pixman_fixed_t ux,
- int width);
-
-static force_inline void
-scaled_bilinear_scanline_neon_8888_8888_SRC (uint32_t * dst,
- const uint32_t * mask,
- const uint32_t * src_top,
- const uint32_t * src_bottom,
- int32_t w,
- int wt,
- int wb,
- pixman_fixed_t vx,
- pixman_fixed_t unit_x,
- pixman_fixed_t max_vx,
- pixman_bool_t zero_src)
-{
- pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top,
- src_bottom, wt, wb,
- vx, unit_x, w);
-}
-
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_cover_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- COVER, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_pad_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- PAD, FALSE, FALSE)
-FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_8888_none_SRC,
- scaled_bilinear_scanline_neon_8888_8888_SRC,
- uint32_t, uint32_t, uint32_t,
- NONE, FALSE, FALSE)
-
static const pixman_fast_path_t arm_neon_fast_paths[] =
{
PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565),
--
1.7.3.4

View File

@@ -0,0 +1,271 @@
From 34098dba6763afd3636a14f9c2a079ab08f23b2d Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 11:34:15 +0200
Subject: [PATCH 17/22] ARM: NEON: common macro template for bilinear scanline scalers
This allows to generate bilinear scanline scaling functions targeting
various source and destination color formats. Right now a8r8g8b8/x8r8g8b8
and r5g6b5 color formats are supported. More formats can be added if needed.
---
pixman/pixman-arm-neon-asm.S | 222 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon-asm.h | 17 +++
2 files changed, 239 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index c168e10..f3784f5 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2588,3 +2588,225 @@ pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
.unreq TMP1
.unreq TMP2
.endfunc
+
+.purgem bilinear_interpolate_last_pixel
+.purgem bilinear_interpolate_two_pixels
+.purgem bilinear_interpolate_four_pixels
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #2
+ add TMP2, BOTTOM, TMP2, asl #2
+ vld1.32 {reg1}, [TMP1]
+ vld1.32 {reg2}, [TMP2]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ vld1.32 {reg2[0]}, [TMP1]
+ vld1.32 {reg2[1]}, [TMP2]
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+ vst1.32 {d0, d1}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d0}, [OUT]!
+.elseif numpix == 1
+ vst1.32 {d0[0]}, [OUT, :32]!
+.else
+ .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+ vuzp.u8 d0, d1
+ vuzp.u8 d2, d3
+ vuzp.u8 d1, d3
+ vuzp.u8 d0, d2
+ convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+ vst1.16 {d2}, [OUT]!
+.elseif numpix == 2
+ vst1.32 {d2[0]}, [OUT]!
+.elseif numpix == 1
+ vst1.16 {d2[0]}, [OUT]!
+.else
+ .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ vshr.u16 d30, d24, #8
+ /* 4 cycles bubble */
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ /* 5 cycles bubble */
+ vshrn.u32 d0, q0, #16
+ /* 3 cycles bubble */
+ vmovn.u16 d0, q0
+ /* 1 cycle bubble */
+ bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ bilinear_load_&src_fmt d20, d21, d22
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshrn.u32 d30, q0, #16
+ vshrn.u32 d31, q10, #16
+ vmovn.u16 d0, q15
+ bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ bilinear_load_&src_fmt d0, d1, d2
+ vmull.u8 q1, d0, d28
+ vmlal.u8 q1, d1, d29
+ bilinear_load_&src_fmt d20, d21, d22
+ vmull.u8 q11, d20, d28
+ vmlal.u8 q11, d21, d29
+ bilinear_load_&src_fmt d4, d5, d6
+ vmull.u8 q3, d4, d28
+ vmlal.u8 q3, d5, d29
+ bilinear_load_&src_fmt d16, d17, d18
+ vmull.u8 q9, d16, d28
+ vmlal.u8 q9, d17, d29
+ pld [TMP1, PF_OFFS]
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vshll.u16 q0, d2, #8
+ vmlsl.u16 q0, d2, d30
+ vmlal.u16 q0, d3, d30
+ vshll.u16 q10, d22, #8
+ vmlsl.u16 q10, d22, d31
+ vmlal.u16 q10, d23, d31
+ vshr.u16 q15, q12, #8
+ vshll.u16 q2, d6, #8
+ vmlsl.u16 q2, d6, d30
+ vmlal.u16 q2, d7, d30
+ vshll.u16 q8, d18, #8
+ pld [TMP2, PF_OFFS]
+ vmlsl.u16 q8, d18, d31
+ vmlal.u16 q8, d19, d31
+ vadd.u16 q12, q12, q13
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q10, #16
+ vshrn.u32 d4, q2, #16
+ vshrn.u32 d5, q8, #16
+ vmovn.u16 d0, q0
+ vmovn.u16 d1, q2
+ bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * TODO: use software pipelining and aligned writes to the destination buffer
+ * in order to improve performance
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
+ * prefetch_distance - prefetch in the source image by that many
+ * pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+ bpp_shift, prefetch_distance
+
+pixman_asm_function fname
+ OUT .req r0
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3
+ TMP2 .req r4
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+
+ mov ip, sp
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH}
+ mul PF_OFFS, PF_OFFS, UX
+
+ cmp WIDTH, #0
+ ble 3f
+
+ vdup.u16 q12, X
+ vdup.u16 q13, UX
+ vdup.u8 d28, WT
+ vdup.u8 d29, WB
+ vadd.u16 d25, d25, d26
+ vadd.u16 q13, q13, q13
+
+ subs WIDTH, WIDTH, #4
+ blt 1f
+ mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
+0:
+ bilinear_interpolate_four_pixels src_fmt, dst_fmt
+ subs WIDTH, WIDTH, #4
+ bge 0b
+1:
+ tst WIDTH, #2
+ beq 2f
+ bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+ pop {r4, r5, r6, r7, r8, r9}
+ bx lr
+
+ .unreq OUT
+ .unreq TOP
+ .unreq BOTTOM
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+.endfunc
+
+.endm
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 24fa361..97adc6a 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -1158,3 +1158,20 @@ fname:
vsri.u16 out, tmp1, #5
vsri.u16 out, tmp2, #11
.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+ vshl.u16 out0, in, #5 /* G top 6 bits */
+ vshl.u16 tmp, in, #11 /* B top 5 bits */
+ vsri.u16 in, in, #5 /* R is ready in top bits */
+ vsri.u16 out0, out0, #6 /* G is ready in top bits */
+ vsri.u16 tmp, tmp, #5 /* B is ready in top bits */
+ vshr.u16 out1, in, #8 /* R is in place */
+ vsri.u16 out0, tmp, #8 /* G & B is in place */
+ vzip.u16 out0, out1 /* everything is in place */
+.endm
--
1.7.3.4

View File

@@ -0,0 +1,226 @@
From 11a0c5badbc59ce967707ef836313cc98f8aec4e Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 11:46:48 +0200
Subject: [PATCH 18/22] ARM: use common macro template for bilinear scaled 'src_8888_8888'
This is a cleanup for old and now duplicated code. The performance improvement
is mostly coming from the enabled use of software prefetch, but instructions
scheduling is also slightly better.
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s
after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s
---
pixman/pixman-arm-neon-asm.S | 191 +-----------------------------------------
1 files changed, 3 insertions(+), 188 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index f3784f5..52dc444 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2405,194 +2405,6 @@ generate_composite_function_nearest_scanline \
fname:
.endm
-.macro bilinear_interpolate_last_pixel
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vshr.u16 d30, d24, #8
- vld1.32 {d1}, [TMP2]
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- /* 5 cycles bubble */
- vshll.u16 q0, d2, #8
- vmlsl.u16 q0, d2, d30
- vmlal.u16 q0, d3, d30
- /* 5 cycles bubble */
- vshrn.u32 d0, q0, #16
- /* 3 cycles bubble */
- vmovn.u16 d0, q0
- /* 1 cycle bubble */
- vst1.32 {d0[0]}, [OUT, :32]!
-.endm
-
-.macro bilinear_interpolate_two_pixels
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vld1.32 {d1}, [TMP2]
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d20}, [TMP1]
- vld1.32 {d21}, [TMP2]
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
- vshll.u16 q0, d2, #8
- vmlsl.u16 q0, d2, d30
- vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
- vmlsl.u16 q10, d22, d31
- vmlal.u16 q10, d23, d31
- vshrn.u32 d30, q0, #16
- vshrn.u32 d31, q10, #16
- vmovn.u16 d0, q15
- vst1.32 {d0}, [OUT]!
-.endm
-
-.macro bilinear_interpolate_four_pixels
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d0}, [TMP1]
- vld1.32 {d1}, [TMP2]
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d20}, [TMP1]
- vld1.32 {d21}, [TMP2]
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
- vshll.u16 q0, d2, #8
- vmlsl.u16 q0, d2, d30
- vmlal.u16 q0, d3, d30
- vshll.u16 q10, d22, #8
- vmlsl.u16 q10, d22, d31
- vmlal.u16 q10, d23, d31
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d4}, [TMP1]
- vld1.32 {d5}, [TMP2]
- vmull.u8 q3, d4, d28
- vmlal.u8 q3, d5, d29
- mov TMP1, X, asr #16
- mov TMP2, X, asr #16
- add X, X, UX
- add TMP1, TOP, TMP1, asl #2
- add TMP2, BOTTOM, TMP2, asl #2
- vld1.32 {d16}, [TMP1]
- vld1.32 {d17}, [TMP2]
- vmull.u8 q9, d16, d28
- vmlal.u8 q9, d17, d29
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
- vshll.u16 q2, d6, #8
- vmlsl.u16 q2, d6, d30
- vmlal.u16 q2, d7, d30
- vshll.u16 q8, d18, #8
- vmlsl.u16 q8, d18, d31
- vmlal.u16 q8, d19, d31
- vshrn.u32 d0, q0, #16
- vshrn.u32 d1, q10, #16
- vshrn.u32 d4, q2, #16
- vshrn.u32 d5, q8, #16
- vmovn.u16 d0, q0
- vmovn.u16 d1, q2
- vst1.32 {d0, d1}, [OUT]!
-.endm
-
-
-/*
- * pixman_scaled_bilinear_scanline_8888_8888_SRC (uint32_t * out,
- * const uint32_t * top,
- * const uint32_t * bottom,
- * int wt,
- * int wb,
- * pixman_fixed_t x,
- * pixman_fixed_t ux,
- * int width)
- */
-
-pixman_asm_function pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon
- OUT .req r0
- TOP .req r1
- BOTTOM .req r2
- WT .req r3
- WB .req r4
- X .req r5
- UX .req r6
- WIDTH .req ip
- TMP1 .req r3
- TMP2 .req r4
-
- mov ip, sp
- push {r4, r5, r6, r7}
- ldmia ip, {WB, X, UX, WIDTH}
-
- cmp WIDTH, #0
- ble 3f
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
-0:
- bilinear_interpolate_four_pixels
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel
-3:
- pop {r4, r5, r6, r7}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq BOTTOM
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq TMP1
- .unreq TMP2
-.endfunc
-
-.purgem bilinear_interpolate_last_pixel
-.purgem bilinear_interpolate_two_pixels
-.purgem bilinear_interpolate_four_pixels
-
/*
* Bilinear scaling support code which tries to provide pixel fetching, color
* format conversion, and interpolation as separate macros which can be used
@@ -2810,3 +2622,6 @@ pixman_asm_function fname
.endfunc
.endm
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
--
1.7.3.4

View File

@@ -0,0 +1,51 @@
From 2ee27e7d79637da9173ee1bf3423e5a81534ccb4 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 11:53:04 +0200
Subject: [PATCH 19/22] ARM: NEON optimization for bilinear scaled 'src_8888_0565'
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=20028888, dst=10020565, speed=6.56 MPix/s
after: op=1, src=20028888, dst=10020565, speed=61.65 MPix/s
---
pixman/pixman-arm-neon-asm.S | 3 +++
pixman/pixman-arm-neon.c | 5 +++++
2 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 52dc444..f0b42ca 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2625,3 +2625,6 @@ pixman_asm_function fname
generate_bilinear_scanline_func \
pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 98ad5f2..ba6de66 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -129,6 +129,8 @@ PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+ uint32_t, uint16_t)
void
pixman_composite_src_n_8_asm_neon (int32_t w,
@@ -350,6 +352,9 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,50 @@
From 29003c3befe2159396d181ef9ac1caaadcabf382 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 13:21:53 +0200
Subject: [PATCH 20/22] ARM: NEON optimization for bilinear scaled 'src_0565_x888'
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=10020565, dst=20020888, speed=3.39 MPix/s
after: op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
---
pixman/pixman-arm-neon-asm.S | 3 +++
pixman/pixman-arm-neon.c | 4 ++++
2 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index f0b42ca..9245db9 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2628,3 +2628,6 @@ generate_bilinear_scanline_func \
generate_bilinear_scanline_func \
pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, 2, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index ba6de66..18e26eb 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -131,6 +131,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
uint32_t, uint32_t)
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+ uint16_t, uint32_t)
void
pixman_composite_src_n_8_asm_neon (int32_t w,
@@ -355,6 +357,8 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,49 @@
From fe99673719091d4a880d031add1369332a75731b Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 13:27:41 +0200
Subject: [PATCH 21/22] ARM: NEON optimization for bilinear scaled 'src_0565_0565'
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=10020565, dst=10020565, speed=3.30 MPix/s
after: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
---
pixman/pixman-arm-neon-asm.S | 3 +++
pixman/pixman-arm-neon.c | 3 +++
2 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 9245db9..2b6875b 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2631,3 +2631,6 @@ generate_bilinear_scanline_func \
generate_bilinear_scanline_func \
pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, 1, 28
+
+generate_bilinear_scanline_func \
+ pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, 1, 28
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 18e26eb..0a10ca1 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -133,6 +133,8 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
uint32_t, uint16_t)
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+ uint16_t, uint16_t)
void
pixman_composite_src_n_8_asm_neon (int32_t w,
@@ -358,6 +360,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+ SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
{ PIXMAN_OP_NONE },
};
--
1.7.3.4

View File

@@ -0,0 +1,166 @@
From 70a923882ca24664344ba91a649e7aa12c3063f7 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Wed, 9 Mar 2011 13:55:48 +0200
Subject: [PATCH 22/22] ARM: a bit faster NEON bilinear scaling for r5g6b5 source images
Instructions scheduling improved in the code responsible for fetching r5g6b5
pixels and converting them to the intermediate x8r8g8b8 color format used in
the interpolation part of code. Still a lot of NEON stalls are remaining,
which can be resolved later by the use of pipelining.
Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
after: op=1, src=10020565, dst=10020565, speed=41.35 MPix/s
op=1, src=10020565, dst=20020888, speed=49.16 MPix/s
---
pixman/pixman-arm-neon-asm.S | 118 +++++++++++++++++++++++++++++++++++------
1 files changed, 100 insertions(+), 18 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 2b6875b..71b30ac 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -2430,6 +2430,101 @@ fname:
convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ bilinear_load_8888 reg3, reg4, tmp2
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {acc2lo[0]}, [TMP1]
+ vld1.32 {acc2hi[0]}, [TMP3]
+ vld1.32 {acc2lo[1]}, [TMP2]
+ vld1.32 {acc2hi[1]}, [TMP4]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ vzip.u8 reg1, reg3
+ vzip.u8 reg2, reg4
+ vzip.u8 reg3, reg4
+ vzip.u8 reg1, reg2
+ vmull.u8 acc1, reg1, d28
+ vmlal.u8 acc1, reg2, d29
+ vmull.u8 acc2, reg3, d28
+ vmlal.u8 acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1]
+ vld1.32 {xacc2hi[0]}, [TMP3]
+ vld1.32 {xacc2lo[1]}, [TMP2]
+ vld1.32 {xacc2hi[1]}, [TMP4]
+ convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {yacc2lo[0]}, [TMP1]
+ vzip.u8 xreg1, xreg3
+ vld1.32 {yacc2hi[0]}, [TMP3]
+ vzip.u8 xreg2, xreg4
+ vld1.32 {yacc2lo[1]}, [TMP2]
+ vzip.u8 xreg3, xreg4
+ vld1.32 {yacc2hi[1]}, [TMP4]
+ vzip.u8 xreg1, xreg2
+ convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+ vmull.u8 xacc1, xreg1, d28
+ vzip.u8 yreg1, yreg3
+ vmlal.u8 xacc1, xreg2, d29
+ vzip.u8 yreg2, yreg4
+ vmull.u8 xacc2, xreg3, d28
+ vzip.u8 yreg3, yreg4
+ vmlal.u8 xacc2, xreg4, d29
+ vzip.u8 yreg1, yreg2
+ vmull.u8 yacc1, yreg1, d28
+ vmlal.u8 yacc1, yreg2, d29
+ vmull.u8 yacc2, yreg3, d28
+ vmlal.u8 yacc2, yreg4, d29
+.endm
+
.macro bilinear_store_8888 numpix, tmp1, tmp2
.if numpix == 4
vst1.32 {d0, d1}, [OUT]!
@@ -2477,12 +2572,8 @@ fname:
.endm
.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
- bilinear_load_&src_fmt d0, d1, d2
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- bilinear_load_&src_fmt d20, d21, d22
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
+ bilinear_load_and_vertical_interpolate_two_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23
vshr.u16 q15, q12, #8
vadd.u16 q12, q12, q13
vshll.u16 q0, d2, #8
@@ -2498,18 +2589,9 @@ fname:
.endm
.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
- bilinear_load_&src_fmt d0, d1, d2
- vmull.u8 q1, d0, d28
- vmlal.u8 q1, d1, d29
- bilinear_load_&src_fmt d20, d21, d22
- vmull.u8 q11, d20, d28
- vmlal.u8 q11, d21, d29
- bilinear_load_&src_fmt d4, d5, d6
- vmull.u8 q3, d4, d28
- vmlal.u8 q3, d5, d29
- bilinear_load_&src_fmt d16, d17, d18
- vmull.u8 q9, d16, d28
- vmlal.u8 q9, d17, d29
+ bilinear_load_and_vertical_interpolate_four_&src_fmt \
+ q1, q11, d0, d1, d20, d21, d22, d23 \
+ q3, q9, d4, d5, d16, d17, d18, d19
pld [TMP1, PF_OFFS]
vshr.u16 q15, q12, #8
vadd.u16 q12, q12, q13
--
1.7.3.4

View File

@@ -0,0 +1,74 @@
# Copyright 1999-2011 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2
# $Header: $
# Ebuild for pixman with a series of upstream git patches (applied in
# src_prepare) layered on top of the release tarball.
EAPI=3
# xorg-2 supplies xorg-2_pkg_setup, which pkg_setup chains to.
# NOTE(review): gcc-version and version_is_at_least are used in pkg_setup and
# presumably come from toolchain-funcs and versionator respectively - confirm.
inherit xorg-2 toolchain-funcs versionator
# NOTE(review): nothing in this ebuild reads EGIT_REPO_URI directly; presumably
# it is only consumed by the eclass for live (git) builds - confirm.
EGIT_REPO_URI="git://anongit.freedesktop.org/git/pixman"
DESCRIPTION="Low-level pixel manipulation routines"
# Only ~arm is keyworded here, although IUSE below also carries flags for
# other architectures (altivec, mmx, sse2).
KEYWORDS="~arm"
# Every flag listed here is translated into a configure switch by pkg_setup.
IUSE="altivec mmx sse2 simd neon"
# Translate USE flags into configure switches.
#
# Globals:
#   CONFIGURE_OPTIONS (written) - whitespace-separated configure switches.
#     NOTE(review): not consumed in this file; presumably the xorg-2 eclass
#     passes it to configure - confirm against the eclass.
# Calls: xorg-2_pkg_setup, use, use_enable, version_is_at_least, gcc-version,
#   ewarn (all provided outside this file).
#
# MMX and SSE2 are handled together: SSE2 is only enabled when MMX is enabled
# too, and on x86 SSE2 is dropped when the compiler is older than GCC 4.2.
pkg_setup() {
	xorg-2_pkg_setup

	CONFIGURE_OPTIONS="
		$(use_enable altivec vmx)
		$(use_enable simd arm-simd)
		$(use_enable neon arm-neon)
		--disable-gtk"

	local enable_mmx=0
	local enable_sse2=0
	if use mmx; then
		enable_mmx=1
	fi
	if use sse2; then
		enable_sse2=1
	fi

	# this block fixes bug #260287
	# The helper is called directly: wrapping it in $( ) would execute whatever
	# it prints as a command and misreport the result.
	if use x86 && use sse2 && ! version_is_at_least "4.2" "$(gcc-version)"; then
		ewarn "SSE2 instructions require GCC 4.2 or higher."
		ewarn "pixman will be built *without* SSE2 support"
		enable_sse2=0
	fi

	# this block fixes bug #236558
	local simd_options
	case "${enable_mmx},${enable_sse2}" in
		'1,1')
			simd_options="--enable-mmx --enable-sse2"
			;;
		'1,0')
			simd_options="--enable-mmx --disable-sse2"
			;;
		'0,1')
			ewarn "You enabled SSE2 but have MMX disabled. This is an invalid combination."
			ewarn "pixman will be built *without* MMX/SSE2 support."
			simd_options="--disable-mmx --disable-sse2"
			;;
		*)
			simd_options="--disable-mmx --disable-sse2"
			;;
	esac

	CONFIGURE_OPTIONS="${CONFIGURE_OPTIONS} ${simd_options}"
}
# Apply the bundled upstream patches (0002 through 0022) from FILESDIR in
# numeric order, one epatch call per file, then regenerate the build system.
src_prepare() {
	# Order matters: later patches build on files touched by earlier ones.
	local patch_series=(
		0002-Fix-compilation-on-Win32.patch
		0003-test-Fix-tests-for-compilation-on-Windows.patch
		0004-test-Add-Makefile-for-Win32.patch
		0005-Do-not-include-unused-headers.patch
		0006-test-Silence-MSVC-warnings.patch
		0007-Main-loop-template-for-fast-single-pass-bilinear-sca.patch
		0008-test-check-correctness-of-bilinear_pad_repeat_get_sc.patch
		0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch
		0010-ARM-NEON-optimization-for-bilinear-scaled-src_8888_8.patch
		0011-test-In-image_endian_swap-use-pixman_image_get_forma.patch
		0012-test-Do-endian-swapping-of-the-source-and-destinatio.patch
		0013-ARM-use-prefetch-in-nearest-scaled-src_0565_0565.patch
		0014-ARM-common-macro-for-nearest-scaling-fast-paths.patch
		0015-ARM-assembly-optimized-nearest-scaled-src_8888_8888.patch
		0016-ARM-new-bilinear-fast-path-template-macro-in-pixman-.patch
		0017-ARM-NEON-common-macro-template-for-bilinear-scanline.patch
		0018-ARM-use-common-macro-template-for-bilinear-scaled-sr.patch
		0019-ARM-NEON-optimization-for-bilinear-scaled-src_8888_0.patch
		0020-ARM-NEON-optimization-for-bilinear-scaled-src_0565_x.patch
		0021-ARM-NEON-optimization-for-bilinear-scaled-src_0565_0.patch
		0022-ARM-a-bit-faster-NEON-bilinear-scaling-for-r5g6b5-so.patch
	)
	local patch_file
	for patch_file in "${patch_series[@]}"; do
		epatch "${FILESDIR}/${patch_file}"
	done

	# We patch Makefile.am and such, so eautoreconf!
	eautoreconf
}