From 5f1a0320b92a60ee1283522135e00bff540ea115 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Mon, 27 Feb 2023 16:04:50 -0600 Subject: [PATCH] internal/bytealg: rewrite PPC64 Compare MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Merge the P8 and P9 paths into one. This removes the need for a runtime CPU check and maintaining two separate code paths. This takes advantage of overlapping checks, and the P9 SETB (emulated with little overhead on P8) to speed up comparisons of small strings. Similarly, the SETB instruction can be used on GOPPC64=power9 which provides a small speedup over using a couple ISELs. This only accounts for a few percent on very small strings, thus results of running P8 codegen on P9 are left out. For the baseline on a power8 machine: BytesCompare/1 7.76ns ± 0% 6.38ns ± 0% -17.71% BytesCompare/2 7.77ns ± 0% 6.36ns ± 0% -18.12% BytesCompare/3 7.56ns ± 0% 6.36ns ± 0% -15.79% BytesCompare/4 7.76ns ± 0% 5.74ns ± 0% -25.99% BytesCompare/5 7.48ns ± 0% 5.74ns ± 0% -23.29% BytesCompare/6 7.56ns ± 0% 5.74ns ± 0% -24.06% BytesCompare/7 7.14ns ± 0% 5.74ns ± 0% -19.63% BytesCompare/8 5.58ns ± 0% 5.19ns ± 0% -7.03% BytesCompare/9 7.85ns ± 0% 5.19ns ± 0% -33.86% BytesCompare/10 7.87ns ± 0% 5.19ns ± 0% -34.06% BytesCompare/11 7.59ns ± 0% 5.19ns ± 0% -31.59% BytesCompare/12 7.87ns ± 0% 5.19ns ± 0% -34.02% BytesCompare/13 7.55ns ± 0% 5.19ns ± 0% -31.24% BytesCompare/14 7.47ns ± 0% 5.19ns ± 0% -30.53% BytesCompare/15 7.88ns ± 0% 5.19ns ± 0% -34.09% BytesCompare/16 6.07ns ± 0% 5.58ns ± 0% -8.08% BytesCompare/17 9.05ns ± 0% 5.62ns ± 0% -37.94% BytesCompare/18 8.95ns ± 0% 5.62ns ± 0% -37.24% BytesCompare/19 8.49ns ± 0% 5.62ns ± 0% -33.81% BytesCompare/20 9.07ns ± 0% 5.62ns ± 0% -38.05% BytesCompare/21 8.69ns ± 0% 5.62ns ± 0% -35.37% BytesCompare/22 8.57ns ± 0% 5.62ns ± 0% -34.43% BytesCompare/23 8.31ns ± 0% 5.62ns ± 0% -32.38% BytesCompare/24 8.42ns ± 0% 5.62ns ± 0% -33.23% BytesCompare/25 9.70ns ± 0% 5.56ns ± 0% -42.69% BytesCompare/26 9.53ns ± 0% 5.56ns ± 0% -41.66% BytesCompare/27 9.29ns ± 0% 5.56ns ± 0% -40.15% BytesCompare/28 9.53ns ± 0% 5.56ns ± 0% -41.65% BytesCompare/29 9.37ns ± 0% 5.56ns ± 0% -40.63% BytesCompare/30 9.17ns ± 0% 5.56ns ± 0% -39.36% BytesCompare/31 9.07ns ± 0% 5.56ns ± 0% -38.71% BytesCompare/32 5.81ns ± 0% 5.49ns ± 0% -5.49% BytesCompare/33 9.36ns ± 0% 5.32ns ± 0% -43.17% BytesCompare/34 9.44ns ± 0% 5.32ns ± 0% -43.68% BytesCompare/35 8.91ns ± 0% 5.32ns ± 0% -40.29% BytesCompare/36 9.45ns ± 0% 5.32ns ± 0% -43.71% BytesCompare/37 8.94ns ± 0% 5.32ns ± 0% -40.53% BytesCompare/38 9.08ns ± 0% 5.32ns ± 0% -41.44% BytesCompare/39 8.62ns ± 0% 5.32ns ± 0% -38.33% BytesCompare/40 7.93ns ± 0% 5.32ns ± 0% -32.93% BytesCompare/41 10.1ns ± 0% 5.3ns ± 0% -47.08% BytesCompare/42 10.1ns ± 0% 5.3ns ± 0% -47.43% BytesCompare/43 9.80ns ± 0% 5.32ns ± 0% -45.66% BytesCompare/44 10.3ns ± 0% 5.3ns ± 0% -48.26% BytesCompare/45 9.88ns ± 0% 5.33ns ± 0% -46.08% BytesCompare/46 9.82ns ± 0% 5.32ns ± 0% -45.81% BytesCompare/47 9.73ns ± 0% 5.33ns ± 0% -45.25% BytesCompare/48 8.31ns ± 0% 5.22ns ± 0% -37.19% BytesCompare/49 11.2ns ± 0% 5.2ns ± 0% -53.28% BytesCompare/50 11.1ns ± 0% 5.2ns ± 0% -52.86% BytesCompare/51 10.8ns ± 0% 5.2ns ± 0% -51.37% BytesCompare/52 11.1ns ± 0% 5.2ns ± 0% -52.94% BytesCompare/53 10.8ns ± 0% 5.2ns ± 0% -51.50% BytesCompare/54 10.7ns ± 0% 5.2ns ± 0% -51.09% BytesCompare/55 10.3ns ± 0% 5.2ns ± 0% -49.49% BytesCompare/56 10.9ns ± 0% 5.2ns ± 0% -51.73% BytesCompare/57 12.2ns ± 0% 5.3ns ± 0% -56.92% BytesCompare/58 12.2ns ± 0% 5.3ns ± 0% -56.81% BytesCompare/59 11.5ns ± 0% 5.3ns ± 0% -54.45% BytesCompare/60 12.1ns ± 0% 5.3ns ± 0% -56.67% BytesCompare/61 11.7ns ± 0% 5.3ns ± 0% -54.96% BytesCompare/62 11.9ns ± 0% 5.3ns ± 0% -55.76% BytesCompare/63 11.4ns ± 0% 5.3ns ± 0% -53.73% BytesCompare/64 6.08ns ± 0% 5.47ns ± 0% -9.96% BytesCompare/65 9.87ns ± 0% 5.96ns ± 0% -39.57% BytesCompare/66 9.81ns ± 0% 5.96ns ± 0% -39.25% BytesCompare/67 9.49ns ± 0% 5.96ns ± 0% -37.18% BytesCompare/68 9.81ns ± 0% 5.96ns ± 0% -39.26% BytesCompare/69 9.44ns ± 0% 5.96ns ± 0% -36.84% BytesCompare/70 9.58ns ± 0% 5.96ns ± 0% -37.75% BytesCompare/71 9.24ns ± 0% 5.96ns ± 0% -35.50% BytesCompare/72 8.26ns ± 0% 5.94ns ± 0% -28.09% BytesCompare/73 10.6ns ± 0% 5.9ns ± 0% -43.70% BytesCompare/74 10.6ns ± 0% 5.9ns ± 0% -43.87% BytesCompare/75 10.2ns ± 0% 5.9ns ± 0% -41.83% BytesCompare/76 10.7ns ± 0% 5.9ns ± 0% -44.55% BytesCompare/77 10.3ns ± 0% 5.9ns ± 0% -42.51% BytesCompare/78 10.3ns ± 0% 5.9ns ± 0% -42.29% BytesCompare/79 10.2ns ± 0% 5.9ns ± 0% -41.95% BytesCompare/80 8.74ns ± 0% 5.93ns ± 0% -32.23% BytesCompare/81 11.7ns ± 0% 6.8ns ± 0% -41.87% BytesCompare/82 11.7ns ± 0% 6.8ns ± 0% -41.54% BytesCompare/83 11.1ns ± 0% 6.8ns ± 0% -38.32% BytesCompare/84 11.7ns ± 0% 6.8ns ± 0% -41.59% BytesCompare/85 11.2ns ± 0% 6.8ns ± 0% -38.93% BytesCompare/86 11.2ns ± 0% 6.8ns ± 0% -38.87% BytesCompare/87 10.8ns ± 0% 6.8ns ± 0% -37.07% BytesCompare/88 11.3ns ± 0% 6.7ns ± 0% -40.57% BytesCompare/89 12.6ns ± 0% 6.7ns ± 0% -46.57% BytesCompare/90 12.6ns ± 0% 6.7ns ± 0% -46.44% BytesCompare/91 11.9ns ± 0% 6.7ns ± 0% -43.66% BytesCompare/92 12.5ns ± 0% 6.7ns ± 0% -46.09% BytesCompare/93 12.2ns ± 0% 6.7ns ± 0% -44.90% BytesCompare/94 12.4ns ± 0% 6.7ns ± 0% -45.62% BytesCompare/95 11.8ns ± 0% 6.7ns ± 0% -43.00% BytesCompare/96 7.25ns ± 0% 6.62ns ± 0% -8.70% BytesCompare/97 11.1ns ± 0% 7.2ns ± 0% -34.98% BytesCompare/98 10.9ns ± 0% 7.2ns ± 0% -34.03% BytesCompare/99 10.4ns ± 0% 7.2ns ± 0% -31.19% BytesCompare/100 10.9ns ± 0% 7.2ns ± 0% -33.97% BytesCompare/101 10.4ns ± 0% 7.2ns ± 0% -31.19% BytesCompare/102 10.7ns ± 0% 7.2ns ± 0% -32.72% BytesCompare/103 10.2ns ± 0% 7.2ns ± 0% -29.28% BytesCompare/104 9.38ns ± 0% 7.19ns ± 0% -23.33% BytesCompare/105 11.7ns ± 0% 7.2ns ± 0% -38.60% BytesCompare/106 11.7ns ± 0% 7.2ns ± 0% -38.28% BytesCompare/107 11.3ns ± 0% 7.2ns ± 0% -36.48% BytesCompare/108 11.7ns ± 0% 7.2ns ± 0% -38.49% BytesCompare/109 11.4ns ± 0% 7.2ns ± 0% -36.76% BytesCompare/110 11.3ns ± 0% 7.2ns ± 0% -36.37% BytesCompare/111 11.1ns ± 0% 7.2ns ± 0% -35.05% BytesCompare/112 9.95ns ± 0% 7.19ns ± 0% -27.71% BytesCompare/113 12.7ns ± 0% 7.0ns ± 0% -44.71% BytesCompare/114 12.6ns ± 0% 7.0ns ± 0% -44.23% BytesCompare/115 12.3ns ± 0% 7.0ns ± 0% -42.83% BytesCompare/116 12.7ns ± 0% 7.0ns ± 0% -44.67% BytesCompare/117 12.2ns ± 0% 7.0ns ± 0% -42.41% BytesCompare/118 12.2ns ± 0% 7.0ns ± 0% -42.50% BytesCompare/119 11.9ns ± 0% 7.0ns ± 0% -40.76% BytesCompare/120 12.3ns ± 0% 7.0ns ± 0% -43.01% BytesCompare/121 13.7ns ± 0% 7.0ns ± 0% -48.55% BytesCompare/122 13.6ns ± 0% 7.0ns ± 0% -48.06% BytesCompare/123 12.9ns ± 0% 7.0ns ± 0% -45.44% BytesCompare/124 13.5ns ± 0% 7.0ns ± 0% -47.91% BytesCompare/125 13.0ns ± 0% 7.0ns ± 0% -46.03% BytesCompare/126 13.2ns ± 0% 7.0ns ± 0% -46.72% BytesCompare/127 12.9ns ± 0% 7.0ns ± 0% -45.36% BytesCompare/128 7.53ns ± 0% 6.78ns ± 0% -9.95% BytesCompare/256 10.1ns ± 0% 9.6ns ± 0% -4.35% BytesCompare/512 23.0ns ± 0% 15.3ns ± 0% -33.30% BytesCompare/1024 36.4ns ± 0% 32.8ns ± 0% -9.83% BytesCompare/2048 62.0ns ± 0% 56.0ns ± 0% -9.77% For GOPPC64=power9 on power9: BytesCompare/1 5.95ns ± 0% 4.83ns ± 0% -18.89% BytesCompare/2 6.37ns ± 0% 4.69ns ± 0% -26.39% BytesCompare/3 6.87ns ± 0% 4.68ns ± 0% -31.79% BytesCompare/4 5.86ns ± 0% 4.63ns ± 0% -20.98% BytesCompare/5 5.84ns ± 0% 4.63ns ± 0% -20.67% BytesCompare/6 5.84ns ± 0% 4.63ns ± 0% -20.70% BytesCompare/7 5.82ns ± 0% 4.63ns ± 0% -20.40% BytesCompare/8 5.81ns ± 0% 4.64ns ± 0% -20.23% BytesCompare/9 5.83ns ± 0% 4.71ns ± 0% -19.19% BytesCompare/10 6.22ns ± 0% 4.71ns ± 0% -24.32% BytesCompare/11 6.94ns ± 0% 4.71ns ± 0% -32.16% BytesCompare/12 5.77ns ± 0% 4.71ns ± 0% -18.34% BytesCompare/13 5.77ns ± 0% 4.71ns ± 0% -18.44% BytesCompare/14 5.77ns ± 0% 4.71ns ± 0% -18.31% BytesCompare/15 6.31ns ± 0% 4.71ns ± 0% -25.32% BytesCompare/16 4.99ns ± 0% 5.03ns ± 0% +0.72% BytesCompare/17 5.07ns ± 0% 5.03ns ± 0% -0.87% BytesCompare/18 5.07ns ± 0% 5.03ns ± 0% -0.81% BytesCompare/19 5.07ns ± 0% 5.03ns ± 0% -0.85% BytesCompare/20 5.07ns ± 0% 5.03ns ± 0% -0.73% BytesCompare/21 5.07ns ± 0% 5.03ns ± 0% -0.81% BytesCompare/22 5.07ns ± 0% 5.03ns ± 0% -0.77% BytesCompare/23 5.07ns ± 0% 5.03ns ± 0% -0.75% BytesCompare/24 5.08ns ± 0% 5.07ns ± 0% -0.12% BytesCompare/25 5.03ns ± 0% 5.00ns ± 0% -0.60% BytesCompare/26 5.02ns ± 0% 5.00ns ± 0% -0.56% BytesCompare/27 5.03ns ± 0% 5.00ns ± 0% -0.60% BytesCompare/28 5.03ns ± 0% 5.00ns ± 0% -0.72% BytesCompare/29 5.03ns ± 0% 5.00ns ± 0% -0.68% BytesCompare/30 5.03ns ± 0% 5.00ns ± 0% -0.76% BytesCompare/31 5.03ns ± 0% 5.00ns ± 0% -0.60% BytesCompare/32 5.02ns ± 0% 5.05ns ± 0% +0.56% BytesCompare/33 6.78ns ± 0% 5.16ns ± 0% -23.84% BytesCompare/34 7.26ns ± 0% 5.16ns ± 0% -28.93% BytesCompare/35 7.78ns ± 0% 5.16ns ± 0% -33.65% BytesCompare/36 6.72ns ± 0% 5.16ns ± 0% -23.24% BytesCompare/37 7.32ns ± 0% 5.16ns ± 0% -29.55% BytesCompare/38 7.26ns ± 0% 5.16ns ± 0% -28.95% BytesCompare/39 7.99ns ± 0% 5.16ns ± 0% -35.40% BytesCompare/40 6.67ns ± 0% 5.11ns ± 0% -23.41% BytesCompare/41 7.25ns ± 0% 5.14ns ± 0% -29.05% BytesCompare/42 7.47ns ± 0% 5.14ns ± 0% -31.11% BytesCompare/43 7.97ns ± 0% 5.14ns ± 0% -35.42% BytesCompare/44 7.29ns ± 0% 5.14ns ± 0% -29.38% BytesCompare/45 8.06ns ± 0% 5.14ns ± 0% -36.20% BytesCompare/46 7.89ns ± 0% 5.14ns ± 0% -34.77% BytesCompare/47 8.59ns ± 0% 5.14ns ± 0% -40.13% BytesCompare/48 5.57ns ± 0% 5.12ns ± 0% -8.18% BytesCompare/49 6.05ns ± 0% 5.17ns ± 0% -14.48% BytesCompare/50 6.05ns ± 0% 5.17ns ± 0% -14.51% BytesCompare/51 6.06ns ± 0% 5.17ns ± 0% -14.61% BytesCompare/52 6.05ns ± 0% 5.17ns ± 0% -14.54% BytesCompare/53 6.06ns ± 0% 5.17ns ± 0% -14.56% BytesCompare/54 6.05ns ± 0% 5.17ns ± 0% -14.54% BytesCompare/55 6.05ns ± 0% 5.17ns ± 0% -14.54% BytesCompare/56 6.02ns ± 0% 5.11ns ± 0% -15.13% BytesCompare/57 6.01ns ± 0% 5.14ns ± 0% -14.56% BytesCompare/58 6.02ns ± 0% 5.14ns ± 0% -14.59% BytesCompare/59 6.02ns ± 0% 5.14ns ± 0% -14.65% BytesCompare/60 6.03ns ± 0% 5.14ns ± 0% -14.71% BytesCompare/61 6.02ns ± 0% 5.14ns ± 0% -14.69% BytesCompare/62 6.01ns ± 0% 5.14ns ± 0% -14.55% BytesCompare/63 6.02ns ± 0% 5.14ns ± 0% -14.65% BytesCompare/64 6.09ns ± 0% 5.15ns ± 0% -15.34% BytesCompare/65 7.83ns ± 0% 5.93ns ± 0% -24.17% BytesCompare/66 7.86ns ± 0% 5.93ns ± 0% -24.52% BytesCompare/67 8.56ns ± 0% 5.93ns ± 0% -30.68% BytesCompare/68 7.90ns ± 0% 5.93ns ± 0% -24.88% BytesCompare/69 8.58ns ± 0% 5.93ns ± 0% -30.84% BytesCompare/70 8.54ns ± 0% 5.93ns ± 0% -30.48% BytesCompare/71 9.18ns ± 0% 5.94ns ± 0% -35.34% BytesCompare/72 7.89ns ± 0% 5.86ns ± 0% -25.76% BytesCompare/73 8.59ns ± 0% 5.82ns ± 0% -32.25% BytesCompare/74 8.52ns ± 0% 5.82ns ± 0% -31.61% BytesCompare/75 9.17ns ± 0% 5.82ns ± 0% -36.50% BytesCompare/76 8.54ns ± 0% 5.82ns ± 0% -31.85% BytesCompare/77 9.25ns ± 0% 5.82ns ± 0% -37.07% BytesCompare/78 9.17ns ± 0% 5.82ns ± 0% -36.48% BytesCompare/79 10.0ns ± 0% 5.8ns ± 0% -41.66% BytesCompare/80 6.76ns ± 0% 5.69ns ± 0% -15.90% BytesCompare/81 7.63ns ± 0% 6.70ns ± 0% -12.23% BytesCompare/82 7.63ns ± 0% 6.70ns ± 0% -12.23% BytesCompare/83 7.63ns ± 0% 6.70ns ± 0% -12.24% BytesCompare/84 7.63ns ± 0% 6.70ns ± 0% -12.24% BytesCompare/85 7.63ns ± 0% 6.70ns ± 0% -12.23% BytesCompare/86 7.63ns ± 0% 6.70ns ± 0% -12.24% BytesCompare/87 7.63ns ± 0% 6.70ns ± 0% -12.24% BytesCompare/88 7.53ns ± 0% 6.56ns ± 0% -12.90% BytesCompare/89 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/90 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/91 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/92 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/93 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/94 7.53ns ± 0% 6.55ns ± 0% -12.93% BytesCompare/95 7.53ns ± 0% 6.55ns ± 0% -12.94% BytesCompare/96 7.02ns ± 0% 6.45ns ± 0% -8.09% BytesCompare/97 8.73ns ± 0% 7.39ns ± 0% -15.35% BytesCompare/98 8.71ns ± 0% 7.39ns ± 0% -15.15% BytesCompare/99 9.42ns ± 0% 7.39ns ± 0% -21.57% BytesCompare/100 8.73ns ± 0% 7.39ns ± 0% -15.36% BytesCompare/101 9.43ns ± 0% 7.39ns ± 0% -21.70% BytesCompare/102 9.42ns ± 0% 7.39ns ± 0% -21.59% BytesCompare/103 10.2ns ± 0% 7.4ns ± 0% -27.58% BytesCompare/104 8.74ns ± 0% 7.35ns ± 0% -15.95% BytesCompare/105 9.44ns ± 0% 7.30ns ± 0% -22.67% BytesCompare/106 9.44ns ± 0% 7.30ns ± 0% -22.69% BytesCompare/107 10.2ns ± 0% 7.3ns ± 0% -28.53% BytesCompare/108 9.48ns ± 0% 7.30ns ± 0% -23.04% BytesCompare/109 10.2ns ± 0% 7.3ns ± 0% -28.81% BytesCompare/110 10.2ns ± 0% 7.3ns ± 0% -28.39% BytesCompare/111 10.9ns ± 0% 7.3ns ± 0% -33.18% BytesCompare/112 7.75ns ± 0% 7.16ns ± 0% -7.60% BytesCompare/113 8.57ns ± 0% 7.83ns ± 0% -8.60% BytesCompare/114 8.57ns ± 0% 7.83ns ± 0% -8.63% BytesCompare/115 8.57ns ± 0% 7.83ns ± 0% -8.56% BytesCompare/116 8.57ns ± 0% 7.83ns ± 0% -8.57% BytesCompare/117 8.57ns ± 0% 7.83ns ± 0% -8.56% BytesCompare/118 8.57ns ± 0% 7.83ns ± 0% -8.56% BytesCompare/119 8.57ns ± 0% 7.83ns ± 0% -8.61% BytesCompare/120 8.46ns ± 0% 7.71ns ± 0% -8.80% BytesCompare/121 8.46ns ± 0% 7.72ns ± 0% -8.77% BytesCompare/122 8.46ns ± 0% 7.72ns ± 0% -8.78% BytesCompare/123 8.46ns ± 0% 7.72ns ± 0% -8.76% BytesCompare/124 8.46ns ± 0% 7.72ns ± 0% -8.70% BytesCompare/125 8.46ns ± 0% 7.72ns ± 0% -8.70% BytesCompare/126 8.46ns ± 0% 7.72ns ± 0% -8.70% BytesCompare/127 8.46ns ± 0% 7.72ns ± 0% -8.71% BytesCompare/128 8.19ns ± 0% 7.35ns ± 0% -10.29% BytesCompare/256 12.8ns ± 0% 11.4ns ± 0% -11.23% BytesCompare/512 22.2ns ± 0% 20.7ns ± 0% -6.80% BytesCompare/1024 41.1ns ± 0% 39.8ns ± 0% -3.12% BytesCompare/2048 86.5ns ± 0% 81.1ns ± 0% -6.31% Change-Id: I7c7fb1f7b891c23c6cade580e7b9928ca1a6efc3 Reviewed-on: https://go-review.googlesource.com/c/go/+/474496 Run-TryBot: Paul Murphy Reviewed-by: Lynn Boger Reviewed-by: Heschi Kreinick Reviewed-by: Archana Ravindar TryBot-Result: Gopher Robot Reviewed-by: Dmitri Shuralyov --- src/internal/bytealg/compare_ppc64x.s | 642 ++++++++++---------------- 1 file changed, 236 insertions(+), 406 deletions(-) diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s index cbe0525af5..f3f8b4abd1 100644 --- a/src/internal/bytealg/compare_ppc64x.s +++ b/src/internal/bytealg/compare_ppc64x.s @@ -7,37 +7,62 @@ #include "go_asm.h" #include "textflag.h" +// Helper names for x-form loads in BE ordering. +#ifdef GOARCH_ppc64le +#define _LDBEX MOVDBR +#define _LWBEX MOVWBR +#define _LHBEX MOVHBR +#else +#define _LDBEX MOVD +#define _LWBEX MOVW +#define _LHBEX MOVH +#endif + +#ifdef GOPPC64_power9 +#define SETB_CR0(rout) SETB CR0, rout +#define SETB_CR1(rout) SETB CR1, rout +#define SETB_INIT() +#define SETB_CR0_NE(rout) SETB_CR0(rout) +#else +// A helper macro to emulate SETB on P8. This assumes +// -1 is in R20, and 1 is in R21. crxlt and crxeq must +// also be the same CR field. +#define _SETB(crxlt, crxeq, rout) \ + ISEL crxeq,R0,R21,rout \ + ISEL crxlt,R20,rout,rout + +// A special case when it is know the comparison +// will always be not equal. The result must be -1 or 1. +#define SETB_CR0_NE(rout) \ + ISEL CR0LT,R20,R21,rout + +#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout) +#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout) +#define SETB_INIT() \ + MOVD $-1,R20 \ + MOVD $1,R21 +#endif + TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 // incoming: - // R3 a addr -> R5 - // R4 a len -> R3 - // R5 a cap unused - // R6 b addr -> R6 - // R7 b len -> R4 - // R8 b cap unused - MOVD R3, R5 - MOVD R4, R3 - MOVD R7, R4 - CMP R5,R6,CR7 - CMP R3,R4,CR6 - BEQ CR7,equal - MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 - CMP R16,$1 - BNE power8 - BR cmpbodyp9<>(SB) -power8: + // R3 a addr + // R4 a len + // R6 b addr + // R7 b len + // + // on entry to cmpbody: + // R3 return value if len(a) == len(b) + // R5 a addr + // R6 b addr + // R9 min(len(a),len(b)) + SETB_INIT() + MOVD R3,R5 + CMP R4,R7,CR0 + CMP R3,R6,CR7 + ISEL CR0LT,R4,R7,R9 + SETB_CR0(R3) + BC $12,30,LR // beqlr cr7 BR cmpbody<>(SB) -equal: - BEQ CR6,done - MOVD $1, R8 - BGT CR6,greater - NEG R8 -greater: - MOVD R8, R3 - RET -done: - MOVD $0, R3 - RET TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 // incoming: @@ -45,32 +70,21 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 // R4 a len -> R3 // R5 b addr -> R6 // R6 b len -> R4 - MOVD R6, R7 - MOVD R5, R6 - MOVD R3, R5 - MOVD R4, R3 - MOVD R7, R4 - CMP R5,R6,CR7 - CMP R3,R4,CR6 - BEQ CR7,equal - MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 - CMP R16,$1 - BNE power8 - BR cmpbodyp9<>(SB) -power8: + // + // on entry to cmpbody: + // R3 compare value if compared length is same. + // R5 a addr + // R6 b addr + // R9 min(len(a),len(b)) + SETB_INIT() + CMP R4,R6,CR0 + CMP R3,R5,CR7 + ISEL CR0LT,R4,R6,R9 + MOVD R5,R6 + MOVD R3,R5 + SETB_CR0(R3) + BC $12,30,LR // beqlr cr7 BR cmpbody<>(SB) -equal: - BEQ CR6,done - MOVD $1, R8 - BGT CR6,greater - NEG R8 -greater: - MOVD R8, R3 - RET - -done: - MOVD $0, R3 - RET #ifdef GOARCH_ppc64le DATA byteswap<>+0(SB)/8, $0x0706050403020100 @@ -79,32 +93,33 @@ GLOBL byteswap<>+0(SB), RODATA, $16 #define SWAP V21 #endif -// Do an efficient memcmp for ppc64le/ppc64/POWER8 -// R3 = a len -// R4 = b len -// R5 = a addr -// R6 = b addr -// On exit: -// R3 = return value TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R3,R8 // set up length - CMP R3,R4,CR2 // unequal? - BLT CR2,setuplen // BLT CR2 - MOVD R4,R8 // use R4 for comparison len -setuplen: - CMP R8,$32 // optimize >= 32 - MOVD R8,R9 - BLT setup8a // optimize < 32 - MOVD $16,R10 // set offsets to load into vectors - CMP R8,$64 - BLT cmp32 // process size 32-63 - - DCBT (R5) // optimize >= 64 +start: + CMP R9,$16,CR0 + CMP R9,$32,CR1 + CMP R9,$64,CR2 + MOVD $16,R10 + BLT cmp8 + BLT CR1,cmp16 + BLT CR2,cmp32 + +cmp64: // >= 64B + DCBT (R5) // optimize for size>=64 DCBT (R6) // cache hint + + SRD $6,R9,R14 // There is at least one iteration. + MOVD R14,CTR + ANDCC $63,R9,R9 + CMP R9,$16,CR1 // Do setup for tail check early on. + CMP R9,$32,CR2 + CMP R9,$48,CR3 + ADD $-16,R9,R9 + MOVD $32,R11 // set offsets to load into vector MOVD $48,R12 // set offsets to load into vector -loop64a:// process size 64 and greater + PCALIGN $32 +cmp64_loop: LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector VCMPEQUDCC V3,V4,V1 @@ -112,391 +127,206 @@ loop64a:// process size 64 and greater LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector - VCMPEQUDCC V3,V4,V1 BGE CR6,different LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector - VCMPEQUDCC V3,V4,V1 BGE CR6,different LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector - VCMPEQUDCC V3,V4,V1 BGE CR6,different - ADD $-64,R9,R9 // reduce remaining size by 64 ADD $64,R5,R5 // increment to next 64 bytes of A ADD $64,R6,R6 // increment to next 64 bytes of B - CMPU R9,$64 - BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining - - CMPU R9,$32 - BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining - CMPU R9,$0 - BNE rem // loop to rem if the remainder is not 0 - - BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) - BLT CR2,less // jump to less if len(A)+00(SB), R16 - LXVD2X (R16)(R0),SWAP // Set up swap string + BR cmp64_tail_gt0 - VPERM V3,V3,SWAP,V3 - VPERM V4,V4,SWAP,V4 -#endif - MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison - MFVSRD VS36,R10 + PCALIGN $16 +cmp64_tail_gt16: // 17 - 32B + LXVD2X (R0)(R5),V3 + LXVD2X (R0)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different - CMPU R16,R10 - BEQ lower - BGT greater - MOVD $-1,R3 // return value if A < B - RET -lower: - VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison - MFVSRD VS35,R16 - VSLDOI $8,V4,V4,V4 - MFVSRD VS36,R10 + BR cmp64_tail_gt0 + + PCALIGN $16 +cmp64_tail_gt0: // 1 - 16B + LXVD2X (R5)(R9),V3 + LXVD2X (R6)(R9),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different - CMPU R16,R10 - BGT greater - MOVD $-1,R3 // return value if A < B RET -setup8a: - SRADCC $3,R8,R9 // get the 8 byte count - BEQ leftover // shifted value is 0 - CMPU R8,$8 // optimize 8byte move - BEQ size8 - CMPU R8,$16 - BEQ size16 - MOVD R9,CTR // loop count for doublewords -loop8: -#ifdef GOARCH_ppc64le - MOVDBR (R5+R0),R16 // doublewords to compare - MOVDBR (R6+R0),R10 // LE compare order -#else - MOVD (R5+R0),R16 // doublewords to compare - MOVD (R6+R0),R10 // BE compare order -#endif - ADD $8,R5 - ADD $8,R6 - CMPU R16,R10 // match? - BC 8,2,loop8 // bt ctr <> 0 && cr - BGT greater - BLT less -leftover: - ANDCC $7,R8,R9 // check for leftover bytes - BEQ zeroremainder -simplecheck: - MOVD R0,R14 - CMP R9,$4 // process 4 bytes - BLT halfword -#ifdef GOARCH_ppc64le - MOVWBR (R5)(R14),R10 - MOVWBR (R6)(R14),R11 -#else - MOVWZ (R5)(R14),R10 - MOVWZ (R6)(R14),R11 -#endif - CMPU R10,R11 - BGT greater - BLT less - ADD $-4,R9 - ADD $4,R14 - PCALIGN $16 -halfword: - CMP R9,$2 // process 2 bytes - BLT byte -#ifdef GOARCH_ppc64le - MOVHBR (R5)(R14),R10 - MOVHBR (R6)(R14),R11 -#else - MOVHZ (R5)(R14),R10 - MOVHZ (R6)(R14),R11 -#endif - CMPU R10,R11 - BGT greater - BLT less - ADD $-2,R9 - ADD $2,R14 - PCALIGN $16 -byte: - CMP R9,$0 // process 1 byte - BEQ skip - MOVBZ (R5)(R14),R10 - MOVBZ (R6)(R14),R11 - CMPU R10,R11 - BGT greater - BLT less - PCALIGN $16 -skip: - BEQ CR2,equal - BGT CR2,greater + PCALIGN $16 +cmp32: // 32 - 63B + ANDCC $31,R9,R9 -less: MOVD $-1,R3 // return value if A < B - RET -size16: - LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector - LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + LXVD2X (R0)(R5),V3 + LXVD2X (R0)(R6),V4 VCMPEQUDCC V3,V4,V1 BGE CR6,different -zeroremainder: - BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) - BLT CR2,less // jump to less if len(A) 1st len - BLT CR2,less // 2nd len < 1st len -equal: - MOVD $0, R3 // return value if A == B - RET -greater: - MOVD $1,R3 // return value if A > B - RET -// Do an efficient memcmp for ppc64le/ppc64/POWER9 -// R3 = a len -// R4 = b len -// R5 = a addr -// R6 = b addr -// On exit: -// R3 = return value -TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0 - MOVD R3,R8 // set up length - CMP R3,R4,CR2 // unequal? - BLT CR2,setuplen // BLT CR2 - MOVD R4,R8 // use R4 for comparison len -setuplen: - CMP R8,$16 // optimize for size<16 - MOVD R8,R9 - BLT simplecheck - MOVD $16,R10 // set offsets to load into vectors - CMP R8,$32 // optimize for size 16-31 - BLT cmp16 - CMP R8,$64 - BLT cmp32 // optimize for size 32-63 - DCBT (R5) // optimize for size>=64 - DCBT (R6) // cache hint + LXVD2X (R10)(R5),V3 + LXVD2X (R10)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different - MOVD $32,R11 // set offsets to load into vector - MOVD $48,R12 // set offsets to load into vector + BC $12,2,LR // beqlr + ADD R9,R10,R10 -loop64a:// process size 64 and greater - LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector - LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector - VCMPNEBCC V3,V4,V1 // record comparison into V1 - BNE CR6,different // jump out if its different + LXVD2X (R9)(R5),V3 + LXVD2X (R9)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different - LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector - LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector - VCMPNEBCC V3,V4,V1 - BNE CR6,different + LXVD2X (R10)(R5),V3 + LXVD2X (R10)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + RET - LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector - LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector - VCMPNEBCC V3,V4,V1 - BNE CR6,different + PCALIGN $16 +cmp16: // 16 - 31B + ANDCC $15,R9,R9 + LXVD2X (R0)(R5),V3 + LXVD2X (R0)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + BC $12,2,LR // beqlr - LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector - LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector - VCMPNEBCC V3,V4,V1 - BNE CR6,different + LXVD2X (R9)(R5),V3 + LXVD2X (R9)(R6),V4 + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + RET - ADD $-64,R9,R9 // reduce remaining size by 64 - ADD $64,R5,R5 // increment to next 64 bytes of A - ADD $64,R6,R6 // increment to next 64 bytes of B - CMPU R9,$64 - BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining - - CMPU R9,$32 - BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining - CMPU R9,$16 - BGE cmp16 // loop to cmp16 if there are 16-31 bytes left - CMPU R9,$0 - BNE simplecheck // loop to simplecheck for remaining bytes - - BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) - BLT CR2,less // jump to less if len(A)+00(SB),R16 + LXVD2X (R16)(R0),SWAP // Set up swap string - MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison + VPERM V3,V3,SWAP,V3 + VPERM V4,V4,SWAP,V4 +#endif + + MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison MFVSRD VS36,R10 CMPU R16,R10 BEQ lower - BGT greater - MOVD $-1,R3 // return value if A < B + SETB_CR0_NE(R3) RET + + PCALIGN $16 lower: - MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison - MFVSRLD VS36,R10 + VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison + MFVSRD VS35,R16 + VSLDOI $8,V4,V4,V4 + MFVSRD VS36,R10 CMPU R16,R10 - BGT greater - MOVD $-1,R3 // return value if A < B + SETB_CR0_NE(R3) RET -greater: - MOVD $1,R3 // return value if A > B + PCALIGN $16 +cmp8: // 8 - 15B + CMP R9,$8 + BLT cmp4 + ANDCC $7,R9,R9 + _LDBEX (R0)(R5),R10 + _LDBEX (R0)(R6),R11 + _LDBEX (R9)(R5),R12 + _LDBEX (R9)(R6),R14 + CMPU R10,R11,CR0 + SETB_CR0(R5) + CMPU R12,R14,CR1 + SETB_CR1(R6) + CRAND CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value. + ISEL CR0EQ,R6,R5,R4 + ISEL CR1EQ,R3,R4,R3 RET -cmp16: - ANDCC $16,R9,R31 - BEQ tail - - LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector - LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector - VCMPEQUDCC V3,V4,V1 - BGE CR6,different - - ADD $16,R5 - ADD $16,R6 -tail: - ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b) - BEQ end - - ADD R9,R5 - ADD R9,R6 - MOVD $-16,R10 - LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector - LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector - VCMPEQUDCC V3,V4,V1 - BGE CR6,different -end: - BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) - BLT CR2,less // jump to less if BLT CR2 that is, len(A)