]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: rewrite PPC64 Compare
authorPaul E. Murphy <murp@ibm.com>
Mon, 27 Feb 2023 22:04:50 +0000 (16:04 -0600)
committerPaul Murphy <murp@ibm.com>
Tue, 21 Mar 2023 13:10:36 +0000 (13:10 +0000)
Merge the P8 and P9 paths into one. This removes the need for
a runtime CPU check and maintaining two separate code paths.

This takes advantage of overlapping checks, and the P9 SETB
(emulated with little overhead on P8) to speed up comparisons
of small strings.

Similarly, the SETB instruction can be used on GOPPC64=power9
which provides a small speedup over using a couple ISELs. This
only accounts for a few percent on very small strings, thus
results of running P8 codegen on P9 are left out.

For the baseline on a power8 machine:

BytesCompare/1     7.76ns ± 0%  6.38ns ± 0%  -17.71%
BytesCompare/2     7.77ns ± 0%  6.36ns ± 0%  -18.12%
BytesCompare/3     7.56ns ± 0%  6.36ns ± 0%  -15.79%
BytesCompare/4     7.76ns ± 0%  5.74ns ± 0%  -25.99%
BytesCompare/5     7.48ns ± 0%  5.74ns ± 0%  -23.29%
BytesCompare/6     7.56ns ± 0%  5.74ns ± 0%  -24.06%
BytesCompare/7     7.14ns ± 0%  5.74ns ± 0%  -19.63%
BytesCompare/8     5.58ns ± 0%  5.19ns ± 0%   -7.03%
BytesCompare/9     7.85ns ± 0%  5.19ns ± 0%  -33.86%
BytesCompare/10    7.87ns ± 0%  5.19ns ± 0%  -34.06%
BytesCompare/11    7.59ns ± 0%  5.19ns ± 0%  -31.59%
BytesCompare/12    7.87ns ± 0%  5.19ns ± 0%  -34.02%
BytesCompare/13    7.55ns ± 0%  5.19ns ± 0%  -31.24%
BytesCompare/14    7.47ns ± 0%  5.19ns ± 0%  -30.53%
BytesCompare/15    7.88ns ± 0%  5.19ns ± 0%  -34.09%
BytesCompare/16    6.07ns ± 0%  5.58ns ± 0%   -8.08%
BytesCompare/17    9.05ns ± 0%  5.62ns ± 0%  -37.94%
BytesCompare/18    8.95ns ± 0%  5.62ns ± 0%  -37.24%
BytesCompare/19    8.49ns ± 0%  5.62ns ± 0%  -33.81%
BytesCompare/20    9.07ns ± 0%  5.62ns ± 0%  -38.05%
BytesCompare/21    8.69ns ± 0%  5.62ns ± 0%  -35.37%
BytesCompare/22    8.57ns ± 0%  5.62ns ± 0%  -34.43%
BytesCompare/23    8.31ns ± 0%  5.62ns ± 0%  -32.38%
BytesCompare/24    8.42ns ± 0%  5.62ns ± 0%  -33.23%
BytesCompare/25    9.70ns ± 0%  5.56ns ± 0%  -42.69%
BytesCompare/26    9.53ns ± 0%  5.56ns ± 0%  -41.66%
BytesCompare/27    9.29ns ± 0%  5.56ns ± 0%  -40.15%
BytesCompare/28    9.53ns ± 0%  5.56ns ± 0%  -41.65%
BytesCompare/29    9.37ns ± 0%  5.56ns ± 0%  -40.63%
BytesCompare/30    9.17ns ± 0%  5.56ns ± 0%  -39.36%
BytesCompare/31    9.07ns ± 0%  5.56ns ± 0%  -38.71%
BytesCompare/32    5.81ns ± 0%  5.49ns ± 0%   -5.49%
BytesCompare/33    9.36ns ± 0%  5.32ns ± 0%  -43.17%
BytesCompare/34    9.44ns ± 0%  5.32ns ± 0%  -43.68%
BytesCompare/35    8.91ns ± 0%  5.32ns ± 0%  -40.29%
BytesCompare/36    9.45ns ± 0%  5.32ns ± 0%  -43.71%
BytesCompare/37    8.94ns ± 0%  5.32ns ± 0%  -40.53%
BytesCompare/38    9.08ns ± 0%  5.32ns ± 0%  -41.44%
BytesCompare/39    8.62ns ± 0%  5.32ns ± 0%  -38.33%
BytesCompare/40    7.93ns ± 0%  5.32ns ± 0%  -32.93%
BytesCompare/41    10.1ns ± 0%   5.3ns ± 0%  -47.08%
BytesCompare/42    10.1ns ± 0%   5.3ns ± 0%  -47.43%
BytesCompare/43    9.80ns ± 0%  5.32ns ± 0%  -45.66%
BytesCompare/44    10.3ns ± 0%   5.3ns ± 0%  -48.26%
BytesCompare/45    9.88ns ± 0%  5.33ns ± 0%  -46.08%
BytesCompare/46    9.82ns ± 0%  5.32ns ± 0%  -45.81%
BytesCompare/47    9.73ns ± 0%  5.33ns ± 0%  -45.25%
BytesCompare/48    8.31ns ± 0%  5.22ns ± 0%  -37.19%
BytesCompare/49    11.2ns ± 0%   5.2ns ± 0%  -53.28%
BytesCompare/50    11.1ns ± 0%   5.2ns ± 0%  -52.86%
BytesCompare/51    10.8ns ± 0%   5.2ns ± 0%  -51.37%
BytesCompare/52    11.1ns ± 0%   5.2ns ± 0%  -52.94%
BytesCompare/53    10.8ns ± 0%   5.2ns ± 0%  -51.50%
BytesCompare/54    10.7ns ± 0%   5.2ns ± 0%  -51.09%
BytesCompare/55    10.3ns ± 0%   5.2ns ± 0%  -49.49%
BytesCompare/56    10.9ns ± 0%   5.2ns ± 0%  -51.73%
BytesCompare/57    12.2ns ± 0%   5.3ns ± 0%  -56.92%
BytesCompare/58    12.2ns ± 0%   5.3ns ± 0%  -56.81%
BytesCompare/59    11.5ns ± 0%   5.3ns ± 0%  -54.45%
BytesCompare/60    12.1ns ± 0%   5.3ns ± 0%  -56.67%
BytesCompare/61    11.7ns ± 0%   5.3ns ± 0%  -54.96%
BytesCompare/62    11.9ns ± 0%   5.3ns ± 0%  -55.76%
BytesCompare/63    11.4ns ± 0%   5.3ns ± 0%  -53.73%
BytesCompare/64    6.08ns ± 0%  5.47ns ± 0%   -9.96%
BytesCompare/65    9.87ns ± 0%  5.96ns ± 0%  -39.57%
BytesCompare/66    9.81ns ± 0%  5.96ns ± 0%  -39.25%
BytesCompare/67    9.49ns ± 0%  5.96ns ± 0%  -37.18%
BytesCompare/68    9.81ns ± 0%  5.96ns ± 0%  -39.26%
BytesCompare/69    9.44ns ± 0%  5.96ns ± 0%  -36.84%
BytesCompare/70    9.58ns ± 0%  5.96ns ± 0%  -37.75%
BytesCompare/71    9.24ns ± 0%  5.96ns ± 0%  -35.50%
BytesCompare/72    8.26ns ± 0%  5.94ns ± 0%  -28.09%
BytesCompare/73    10.6ns ± 0%   5.9ns ± 0%  -43.70%
BytesCompare/74    10.6ns ± 0%   5.9ns ± 0%  -43.87%
BytesCompare/75    10.2ns ± 0%   5.9ns ± 0%  -41.83%
BytesCompare/76    10.7ns ± 0%   5.9ns ± 0%  -44.55%
BytesCompare/77    10.3ns ± 0%   5.9ns ± 0%  -42.51%
BytesCompare/78    10.3ns ± 0%   5.9ns ± 0%  -42.29%
BytesCompare/79    10.2ns ± 0%   5.9ns ± 0%  -41.95%
BytesCompare/80    8.74ns ± 0%  5.93ns ± 0%  -32.23%
BytesCompare/81    11.7ns ± 0%   6.8ns ± 0%  -41.87%
BytesCompare/82    11.7ns ± 0%   6.8ns ± 0%  -41.54%
BytesCompare/83    11.1ns ± 0%   6.8ns ± 0%  -38.32%
BytesCompare/84    11.7ns ± 0%   6.8ns ± 0%  -41.59%
BytesCompare/85    11.2ns ± 0%   6.8ns ± 0%  -38.93%
BytesCompare/86    11.2ns ± 0%   6.8ns ± 0%  -38.87%
BytesCompare/87    10.8ns ± 0%   6.8ns ± 0%  -37.07%
BytesCompare/88    11.3ns ± 0%   6.7ns ± 0%  -40.57%
BytesCompare/89    12.6ns ± 0%   6.7ns ± 0%  -46.57%
BytesCompare/90    12.6ns ± 0%   6.7ns ± 0%  -46.44%
BytesCompare/91    11.9ns ± 0%   6.7ns ± 0%  -43.66%
BytesCompare/92    12.5ns ± 0%   6.7ns ± 0%  -46.09%
BytesCompare/93    12.2ns ± 0%   6.7ns ± 0%  -44.90%
BytesCompare/94    12.4ns ± 0%   6.7ns ± 0%  -45.62%
BytesCompare/95    11.8ns ± 0%   6.7ns ± 0%  -43.00%
BytesCompare/96    7.25ns ± 0%  6.62ns ± 0%   -8.70%
BytesCompare/97    11.1ns ± 0%   7.2ns ± 0%  -34.98%
BytesCompare/98    10.9ns ± 0%   7.2ns ± 0%  -34.03%
BytesCompare/99    10.4ns ± 0%   7.2ns ± 0%  -31.19%
BytesCompare/100   10.9ns ± 0%   7.2ns ± 0%  -33.97%
BytesCompare/101   10.4ns ± 0%   7.2ns ± 0%  -31.19%
BytesCompare/102   10.7ns ± 0%   7.2ns ± 0%  -32.72%
BytesCompare/103   10.2ns ± 0%   7.2ns ± 0%  -29.28%
BytesCompare/104   9.38ns ± 0%  7.19ns ± 0%  -23.33%
BytesCompare/105   11.7ns ± 0%   7.2ns ± 0%  -38.60%
BytesCompare/106   11.7ns ± 0%   7.2ns ± 0%  -38.28%
BytesCompare/107   11.3ns ± 0%   7.2ns ± 0%  -36.48%
BytesCompare/108   11.7ns ± 0%   7.2ns ± 0%  -38.49%
BytesCompare/109   11.4ns ± 0%   7.2ns ± 0%  -36.76%
BytesCompare/110   11.3ns ± 0%   7.2ns ± 0%  -36.37%
BytesCompare/111   11.1ns ± 0%   7.2ns ± 0%  -35.05%
BytesCompare/112   9.95ns ± 0%  7.19ns ± 0%  -27.71%
BytesCompare/113   12.7ns ± 0%   7.0ns ± 0%  -44.71%
BytesCompare/114   12.6ns ± 0%   7.0ns ± 0%  -44.23%
BytesCompare/115   12.3ns ± 0%   7.0ns ± 0%  -42.83%
BytesCompare/116   12.7ns ± 0%   7.0ns ± 0%  -44.67%
BytesCompare/117   12.2ns ± 0%   7.0ns ± 0%  -42.41%
BytesCompare/118   12.2ns ± 0%   7.0ns ± 0%  -42.50%
BytesCompare/119   11.9ns ± 0%   7.0ns ± 0%  -40.76%
BytesCompare/120   12.3ns ± 0%   7.0ns ± 0%  -43.01%
BytesCompare/121   13.7ns ± 0%   7.0ns ± 0%  -48.55%
BytesCompare/122   13.6ns ± 0%   7.0ns ± 0%  -48.06%
BytesCompare/123   12.9ns ± 0%   7.0ns ± 0%  -45.44%
BytesCompare/124   13.5ns ± 0%   7.0ns ± 0%  -47.91%
BytesCompare/125   13.0ns ± 0%   7.0ns ± 0%  -46.03%
BytesCompare/126   13.2ns ± 0%   7.0ns ± 0%  -46.72%
BytesCompare/127   12.9ns ± 0%   7.0ns ± 0%  -45.36%
BytesCompare/128   7.53ns ± 0%  6.78ns ± 0%   -9.95%
BytesCompare/256   10.1ns ± 0%   9.6ns ± 0%   -4.35%
BytesCompare/512   23.0ns ± 0%  15.3ns ± 0%  -33.30%
BytesCompare/1024  36.4ns ± 0%  32.8ns ± 0%   -9.83%
BytesCompare/2048  62.0ns ± 0%  56.0ns ± 0%   -9.77%

For GOPPC64=power9 on power9:

BytesCompare/1     5.95ns ± 0%  4.83ns ± 0%  -18.89%
BytesCompare/2     6.37ns ± 0%  4.69ns ± 0%  -26.39%
BytesCompare/3     6.87ns ± 0%  4.68ns ± 0%  -31.79%
BytesCompare/4     5.86ns ± 0%  4.63ns ± 0%  -20.98%
BytesCompare/5     5.84ns ± 0%  4.63ns ± 0%  -20.67%
BytesCompare/6     5.84ns ± 0%  4.63ns ± 0%  -20.70%
BytesCompare/7     5.82ns ± 0%  4.63ns ± 0%  -20.40%
BytesCompare/8     5.81ns ± 0%  4.64ns ± 0%  -20.23%
BytesCompare/9     5.83ns ± 0%  4.71ns ± 0%  -19.19%
BytesCompare/10    6.22ns ± 0%  4.71ns ± 0%  -24.32%
BytesCompare/11    6.94ns ± 0%  4.71ns ± 0%  -32.16%
BytesCompare/12    5.77ns ± 0%  4.71ns ± 0%  -18.34%
BytesCompare/13    5.77ns ± 0%  4.71ns ± 0%  -18.44%
BytesCompare/14    5.77ns ± 0%  4.71ns ± 0%  -18.31%
BytesCompare/15    6.31ns ± 0%  4.71ns ± 0%  -25.32%
BytesCompare/16    4.99ns ± 0%  5.03ns ± 0%   +0.72%
BytesCompare/17    5.07ns ± 0%  5.03ns ± 0%   -0.87%
BytesCompare/18    5.07ns ± 0%  5.03ns ± 0%   -0.81%
BytesCompare/19    5.07ns ± 0%  5.03ns ± 0%   -0.85%
BytesCompare/20    5.07ns ± 0%  5.03ns ± 0%   -0.73%
BytesCompare/21    5.07ns ± 0%  5.03ns ± 0%   -0.81%
BytesCompare/22    5.07ns ± 0%  5.03ns ± 0%   -0.77%
BytesCompare/23    5.07ns ± 0%  5.03ns ± 0%   -0.75%
BytesCompare/24    5.08ns ± 0%  5.07ns ± 0%   -0.12%
BytesCompare/25    5.03ns ± 0%  5.00ns ± 0%   -0.60%
BytesCompare/26    5.02ns ± 0%  5.00ns ± 0%   -0.56%
BytesCompare/27    5.03ns ± 0%  5.00ns ± 0%   -0.60%
BytesCompare/28    5.03ns ± 0%  5.00ns ± 0%   -0.72%
BytesCompare/29    5.03ns ± 0%  5.00ns ± 0%   -0.68%
BytesCompare/30    5.03ns ± 0%  5.00ns ± 0%   -0.76%
BytesCompare/31    5.03ns ± 0%  5.00ns ± 0%   -0.60%
BytesCompare/32    5.02ns ± 0%  5.05ns ± 0%   +0.56%
BytesCompare/33    6.78ns ± 0%  5.16ns ± 0%  -23.84%
BytesCompare/34    7.26ns ± 0%  5.16ns ± 0%  -28.93%
BytesCompare/35    7.78ns ± 0%  5.16ns ± 0%  -33.65%
BytesCompare/36    6.72ns ± 0%  5.16ns ± 0%  -23.24%
BytesCompare/37    7.32ns ± 0%  5.16ns ± 0%  -29.55%
BytesCompare/38    7.26ns ± 0%  5.16ns ± 0%  -28.95%
BytesCompare/39    7.99ns ± 0%  5.16ns ± 0%  -35.40%
BytesCompare/40    6.67ns ± 0%  5.11ns ± 0%  -23.41%
BytesCompare/41    7.25ns ± 0%  5.14ns ± 0%  -29.05%
BytesCompare/42    7.47ns ± 0%  5.14ns ± 0%  -31.11%
BytesCompare/43    7.97ns ± 0%  5.14ns ± 0%  -35.42%
BytesCompare/44    7.29ns ± 0%  5.14ns ± 0%  -29.38%
BytesCompare/45    8.06ns ± 0%  5.14ns ± 0%  -36.20%
BytesCompare/46    7.89ns ± 0%  5.14ns ± 0%  -34.77%
BytesCompare/47    8.59ns ± 0%  5.14ns ± 0%  -40.13%
BytesCompare/48    5.57ns ± 0%  5.12ns ± 0%   -8.18%
BytesCompare/49    6.05ns ± 0%  5.17ns ± 0%  -14.48%
BytesCompare/50    6.05ns ± 0%  5.17ns ± 0%  -14.51%
BytesCompare/51    6.06ns ± 0%  5.17ns ± 0%  -14.61%
BytesCompare/52    6.05ns ± 0%  5.17ns ± 0%  -14.54%
BytesCompare/53    6.06ns ± 0%  5.17ns ± 0%  -14.56%
BytesCompare/54    6.05ns ± 0%  5.17ns ± 0%  -14.54%
BytesCompare/55    6.05ns ± 0%  5.17ns ± 0%  -14.54%
BytesCompare/56    6.02ns ± 0%  5.11ns ± 0%  -15.13%
BytesCompare/57    6.01ns ± 0%  5.14ns ± 0%  -14.56%
BytesCompare/58    6.02ns ± 0%  5.14ns ± 0%  -14.59%
BytesCompare/59    6.02ns ± 0%  5.14ns ± 0%  -14.65%
BytesCompare/60    6.03ns ± 0%  5.14ns ± 0%  -14.71%
BytesCompare/61    6.02ns ± 0%  5.14ns ± 0%  -14.69%
BytesCompare/62    6.01ns ± 0%  5.14ns ± 0%  -14.55%
BytesCompare/63    6.02ns ± 0%  5.14ns ± 0%  -14.65%
BytesCompare/64    6.09ns ± 0%  5.15ns ± 0%  -15.34%
BytesCompare/65    7.83ns ± 0%  5.93ns ± 0%  -24.17%
BytesCompare/66    7.86ns ± 0%  5.93ns ± 0%  -24.52%
BytesCompare/67    8.56ns ± 0%  5.93ns ± 0%  -30.68%
BytesCompare/68    7.90ns ± 0%  5.93ns ± 0%  -24.88%
BytesCompare/69    8.58ns ± 0%  5.93ns ± 0%  -30.84%
BytesCompare/70    8.54ns ± 0%  5.93ns ± 0%  -30.48%
BytesCompare/71    9.18ns ± 0%  5.94ns ± 0%  -35.34%
BytesCompare/72    7.89ns ± 0%  5.86ns ± 0%  -25.76%
BytesCompare/73    8.59ns ± 0%  5.82ns ± 0%  -32.25%
BytesCompare/74    8.52ns ± 0%  5.82ns ± 0%  -31.61%
BytesCompare/75    9.17ns ± 0%  5.82ns ± 0%  -36.50%
BytesCompare/76    8.54ns ± 0%  5.82ns ± 0%  -31.85%
BytesCompare/77    9.25ns ± 0%  5.82ns ± 0%  -37.07%
BytesCompare/78    9.17ns ± 0%  5.82ns ± 0%  -36.48%
BytesCompare/79    10.0ns ± 0%   5.8ns ± 0%  -41.66%
BytesCompare/80    6.76ns ± 0%  5.69ns ± 0%  -15.90%
BytesCompare/81    7.63ns ± 0%  6.70ns ± 0%  -12.23%
BytesCompare/82    7.63ns ± 0%  6.70ns ± 0%  -12.23%
BytesCompare/83    7.63ns ± 0%  6.70ns ± 0%  -12.24%
BytesCompare/84    7.63ns ± 0%  6.70ns ± 0%  -12.24%
BytesCompare/85    7.63ns ± 0%  6.70ns ± 0%  -12.23%
BytesCompare/86    7.63ns ± 0%  6.70ns ± 0%  -12.24%
BytesCompare/87    7.63ns ± 0%  6.70ns ± 0%  -12.24%
BytesCompare/88    7.53ns ± 0%  6.56ns ± 0%  -12.90%
BytesCompare/89    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/90    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/91    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/92    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/93    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/94    7.53ns ± 0%  6.55ns ± 0%  -12.93%
BytesCompare/95    7.53ns ± 0%  6.55ns ± 0%  -12.94%
BytesCompare/96    7.02ns ± 0%  6.45ns ± 0%   -8.09%
BytesCompare/97    8.73ns ± 0%  7.39ns ± 0%  -15.35%
BytesCompare/98    8.71ns ± 0%  7.39ns ± 0%  -15.15%
BytesCompare/99    9.42ns ± 0%  7.39ns ± 0%  -21.57%
BytesCompare/100   8.73ns ± 0%  7.39ns ± 0%  -15.36%
BytesCompare/101   9.43ns ± 0%  7.39ns ± 0%  -21.70%
BytesCompare/102   9.42ns ± 0%  7.39ns ± 0%  -21.59%
BytesCompare/103   10.2ns ± 0%   7.4ns ± 0%  -27.58%
BytesCompare/104   8.74ns ± 0%  7.35ns ± 0%  -15.95%
BytesCompare/105   9.44ns ± 0%  7.30ns ± 0%  -22.67%
BytesCompare/106   9.44ns ± 0%  7.30ns ± 0%  -22.69%
BytesCompare/107   10.2ns ± 0%   7.3ns ± 0%  -28.53%
BytesCompare/108   9.48ns ± 0%  7.30ns ± 0%  -23.04%
BytesCompare/109   10.2ns ± 0%   7.3ns ± 0%  -28.81%
BytesCompare/110   10.2ns ± 0%   7.3ns ± 0%  -28.39%
BytesCompare/111   10.9ns ± 0%   7.3ns ± 0%  -33.18%
BytesCompare/112   7.75ns ± 0%  7.16ns ± 0%   -7.60%
BytesCompare/113   8.57ns ± 0%  7.83ns ± 0%   -8.60%
BytesCompare/114   8.57ns ± 0%  7.83ns ± 0%   -8.63%
BytesCompare/115   8.57ns ± 0%  7.83ns ± 0%   -8.56%
BytesCompare/116   8.57ns ± 0%  7.83ns ± 0%   -8.57%
BytesCompare/117   8.57ns ± 0%  7.83ns ± 0%   -8.56%
BytesCompare/118   8.57ns ± 0%  7.83ns ± 0%   -8.56%
BytesCompare/119   8.57ns ± 0%  7.83ns ± 0%   -8.61%
BytesCompare/120   8.46ns ± 0%  7.71ns ± 0%   -8.80%
BytesCompare/121   8.46ns ± 0%  7.72ns ± 0%   -8.77%
BytesCompare/122   8.46ns ± 0%  7.72ns ± 0%   -8.78%
BytesCompare/123   8.46ns ± 0%  7.72ns ± 0%   -8.76%
BytesCompare/124   8.46ns ± 0%  7.72ns ± 0%   -8.70%
BytesCompare/125   8.46ns ± 0%  7.72ns ± 0%   -8.70%
BytesCompare/126   8.46ns ± 0%  7.72ns ± 0%   -8.70%
BytesCompare/127   8.46ns ± 0%  7.72ns ± 0%   -8.71%
BytesCompare/128   8.19ns ± 0%  7.35ns ± 0%  -10.29%
BytesCompare/256   12.8ns ± 0%  11.4ns ± 0%  -11.23%
BytesCompare/512   22.2ns ± 0%  20.7ns ± 0%   -6.80%
BytesCompare/1024  41.1ns ± 0%  39.8ns ± 0%   -3.12%
BytesCompare/2048  86.5ns ± 0%  81.1ns ± 0%   -6.31%

Change-Id: I7c7fb1f7b891c23c6cade580e7b9928ca1a6efc3
Reviewed-on: https://go-review.googlesource.com/c/go/+/474496
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
src/internal/bytealg/compare_ppc64x.s

index cbe0525af55d40e0e22a292d630098e25afa760a..f3f8b4abd167dfb0684480c11725529ffd2b8564 100644 (file)
@@ -7,37 +7,62 @@
 #include "go_asm.h"
 #include "textflag.h"
 
+// Helper names for x-form loads in BE ordering.
+#ifdef  GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#endif
+
+#ifdef GOPPC64_power9
+#define SETB_CR0(rout) SETB CR0, rout
+#define SETB_CR1(rout) SETB CR1, rout
+#define SETB_INIT()
+#define SETB_CR0_NE(rout) SETB_CR0(rout)
+#else
+// A helper macro to emulate SETB on P8. This assumes
+// -1 is in R20, and 1 is in R21. crxlt and crxeq must
+// also be the same CR field.
+#define _SETB(crxlt, crxeq, rout) \
+       ISEL    crxeq,R0,R21,rout \
+       ISEL    crxlt,R20,rout,rout
+
+// A special case when it is know the comparison
+// will always be not equal. The result must be -1 or 1.
+#define SETB_CR0_NE(rout) \
+       ISEL    CR0LT,R20,R21,rout
+
+#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
+#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
+#define SETB_INIT() \
+       MOVD    $-1,R20 \
+       MOVD    $1,R21
+#endif
+
 TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
        // incoming:
-       // R3 a addr -> R5
-       // R4 a len  -> R3
-       // R5 a cap unused
-       // R6 b addr -> R6
-       // R7 b len  -> R4
-       // R8 b cap unused
-       MOVD    R3, R5
-       MOVD    R4, R3
-       MOVD    R7, R4
-       CMP     R5,R6,CR7
-       CMP     R3,R4,CR6
-       BEQ     CR7,equal
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
-       CMP     R16,$1
-       BNE     power8
-       BR      cmpbodyp9<>(SB)
-power8:
+       // R3 a addr
+       // R4 a len
+       // R6 b addr
+       // R7 b len
+       //
+       // on entry to cmpbody:
+       // R3 return value if len(a) == len(b)
+       // R5 a addr
+       // R6 b addr
+       // R9 min(len(a),len(b))
+       SETB_INIT()
+       MOVD    R3,R5
+       CMP     R4,R7,CR0
+       CMP     R3,R6,CR7
+       ISEL    CR0LT,R4,R7,R9
+       SETB_CR0(R3)
+       BC      $12,30,LR       // beqlr cr7
        BR      cmpbody<>(SB)
-equal:
-       BEQ     CR6,done
-       MOVD    $1, R8
-       BGT     CR6,greater
-       NEG     R8
-greater:
-       MOVD    R8, R3
-       RET
-done:
-       MOVD    $0, R3
-       RET
 
 TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        // incoming:
@@ -45,32 +70,21 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        // R4 a len  -> R3
        // R5 b addr -> R6
        // R6 b len  -> R4
-       MOVD    R6, R7
-       MOVD    R5, R6
-       MOVD    R3, R5
-       MOVD    R4, R3
-       MOVD    R7, R4
-       CMP     R5,R6,CR7
-       CMP     R3,R4,CR6
-       BEQ     CR7,equal
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
-       CMP     R16,$1
-       BNE     power8
-       BR      cmpbodyp9<>(SB)
-power8:
+       //
+       // on entry to cmpbody:
+       // R3 compare value if compared length is same.
+       // R5 a addr
+       // R6 b addr
+       // R9 min(len(a),len(b))
+       SETB_INIT()
+       CMP     R4,R6,CR0
+       CMP     R3,R5,CR7
+       ISEL    CR0LT,R4,R6,R9
+       MOVD    R5,R6
+       MOVD    R3,R5
+       SETB_CR0(R3)
+       BC      $12,30,LR       // beqlr cr7
        BR      cmpbody<>(SB)
-equal:
-       BEQ     CR6,done
-       MOVD    $1, R8
-       BGT     CR6,greater
-       NEG     R8
-greater:
-       MOVD    R8, R3
-       RET
-
-done:
-       MOVD    $0, R3
-       RET
 
 #ifdef GOARCH_ppc64le
 DATA byteswap<>+0(SB)/8, $0x0706050403020100
@@ -79,32 +93,33 @@ GLOBL byteswap<>+0(SB), RODATA, $16
 #define SWAP V21
 #endif
 
-// Do an efficient memcmp for ppc64le/ppc64/POWER8
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R8           // set up length
-       CMP     R3,R4,CR2       // unequal?
-       BLT     CR2,setuplen    // BLT CR2
-       MOVD    R4,R8           // use R4 for comparison len
-setuplen:
-       CMP     R8,$32          // optimize >= 32
-       MOVD    R8,R9
-       BLT     setup8a         // optimize < 32
-       MOVD    $16,R10         // set offsets to load into vectors
-       CMP     R8,$64
-       BLT     cmp32           // process size 32-63
-
-       DCBT    (R5)            // optimize >= 64
+start:
+       CMP     R9,$16,CR0
+       CMP     R9,$32,CR1
+       CMP     R9,$64,CR2
+       MOVD    $16,R10
+       BLT     cmp8
+       BLT     CR1,cmp16
+       BLT     CR2,cmp32
+
+cmp64: // >= 64B
+       DCBT    (R5)            // optimize for size>=64
        DCBT    (R6)            // cache hint
+
+       SRD     $6,R9,R14       // There is at least one iteration.
+       MOVD    R14,CTR
+       ANDCC   $63,R9,R9
+       CMP     R9,$16,CR1      // Do setup for tail check early on.
+       CMP     R9,$32,CR2
+       CMP     R9,$48,CR3
+       ADD     $-16,R9,R9
+
        MOVD    $32,R11         // set offsets to load into vector
        MOVD    $48,R12         // set offsets to load into vector
 
-loop64a:// process size 64 and greater
+       PCALIGN $32
+cmp64_loop:
        LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
        LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
        VCMPEQUDCC      V3,V4,V1
@@ -112,391 +127,206 @@ loop64a:// process size 64 and greater
 
        LXVD2X  (R5)(R10),V3    // load bytes of A at offset 16 into vector
        LXVD2X  (R6)(R10),V4    // load bytes of B at offset 16 into vector
-
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
 
        LXVD2X  (R5)(R11),V3    // load bytes of A at offset 32 into vector
        LXVD2X  (R6)(R11),V4    // load bytes of B at offset 32 into vector
-
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
 
        LXVD2X  (R5)(R12),V3    // load bytes of A at offset 64 into vector
        LXVD2X  (R6)(R12),V4    // load bytes of B at offset 64 into vector
-
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
 
-       ADD     $-64,R9,R9      // reduce remaining size by 64
        ADD     $64,R5,R5       // increment to next 64 bytes of A
        ADD     $64,R6,R6       // increment to next 64 bytes of B
-       CMPU    R9,$64
-       BGE     loop64a         // loop back to loop64a only if there are >= 64 bytes remaining
-       
-       CMPU    R9,$32
-       BGE     cmp32           // loop to cmp32 if there are 32-64 bytes remaining
-       CMPU    R9,$0
-       BNE     rem             // loop to rem if the remainder is not 0
-
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-cmp32:
-       LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
-       LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
+       BDNZ    cmp64_loop
+       BC      $12,2,LR        // beqlr
+
+       // Finish out tail with minimal overlapped checking.
+       // Note, 0 tail is handled by beqlr above.
+       BLE     CR1,cmp64_tail_gt0
+       BLE     CR2,cmp64_tail_gt16
+       BLE     CR3,cmp64_tail_gt32
+
+cmp64_tail_gt48: // 49 - 63 B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
+       LXVD2X  (R5)(R10),V3
+       LXVD2X  (R6)(R10),V4
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
 
-       LXVD2X  (R5)(R10),V3    // load bytes of A at offset 16 into vector
-       LXVD2X  (R6)(R10),V4    // load bytes of B at offset 16 into vector
+       LXVD2X  (R5)(R11),V3
+       LXVD2X  (R6)(R11),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
+       BR cmp64_tail_gt0
+
+       PCALIGN $16
+cmp64_tail_gt32: // 33 - 48B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
 
-       ADD     $-32,R9,R9      // reduce remaining size by 32
-       ADD     $32,R5,R5       // increment to next 32 bytes of A
-       ADD     $32,R6,R6       // increment to next 32 bytes of B
-       CMPU    R9,$0
-       BNE     rem             // loop to rem if the remainder is not 0
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-rem:
-       MOVD    R9,R8
-       ANDCC   $24,R8,R9       // Any 8 byte chunks?
-       BEQ     leftover        // and result is 0
-       BR      setup8a
+       LXVD2X  (R5)(R10),V3
+       LXVD2X  (R6)(R10),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
-different:
-#ifdef GOARCH_ppc64le
-       MOVD    $byteswap<>+00(SB), R16
-       LXVD2X  (R16)(R0),SWAP  // Set up swap string
+       BR cmp64_tail_gt0
 
-       VPERM   V3,V3,SWAP,V3
-       VPERM   V4,V4,SWAP,V4
-#endif
-       MFVSRD  VS35,R16        // move upper doublwords of A and B into GPR for comparison
-       MFVSRD  VS36,R10
+       PCALIGN $16
+cmp64_tail_gt16: // 17 - 32B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
-       CMPU    R16,R10
-       BEQ     lower
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
-       RET
-lower:
-       VSLDOI  $8,V3,V3,V3     // move lower doublwords of A and B into GPR for comparison
-       MFVSRD  VS35,R16
-       VSLDOI  $8,V4,V4,V4
-       MFVSRD  VS36,R10
+       BR cmp64_tail_gt0
+
+       PCALIGN $16
+cmp64_tail_gt0: // 1 - 16B
+       LXVD2X  (R5)(R9),V3
+       LXVD2X  (R6)(R9),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
-       CMPU    R16,R10
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
        RET
-setup8a:
-       SRADCC  $3,R8,R9        // get the 8 byte count
-       BEQ     leftover        // shifted value is 0
-       CMPU    R8,$8           // optimize 8byte move
-       BEQ     size8
-       CMPU    R8,$16
-       BEQ     size16
-       MOVD    R9,CTR          // loop count for doublewords
-loop8:
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R0),R16     // doublewords to compare
-       MOVDBR  (R6+R0),R10     // LE compare order
-#else
-       MOVD    (R5+R0),R16     // doublewords to compare
-       MOVD    (R6+R0),R10     // BE compare order
-#endif
-       ADD     $8,R5
-       ADD     $8,R6
-       CMPU    R16,R10         // match?
-       BC      8,2,loop8       // bt ctr <> 0 && cr
-       BGT     greater
-       BLT     less
-leftover:
-       ANDCC   $7,R8,R9        // check for leftover bytes
-       BEQ     zeroremainder
-simplecheck:
-       MOVD    R0,R14
-       CMP     R9,$4           // process 4 bytes
-       BLT     halfword
-#ifdef  GOARCH_ppc64le
-       MOVWBR  (R5)(R14),R10
-       MOVWBR  (R6)(R14),R11
-#else
-       MOVWZ   (R5)(R14),R10
-       MOVWZ   (R6)(R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $-4,R9
-       ADD     $4,R14
-       PCALIGN $16
 
-halfword:
-       CMP     R9,$2           // process 2 bytes
-       BLT     byte
-#ifdef  GOARCH_ppc64le
-       MOVHBR  (R5)(R14),R10
-       MOVHBR  (R6)(R14),R11
-#else
-       MOVHZ   (R5)(R14),R10
-       MOVHZ   (R6)(R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $-2,R9
-       ADD     $2,R14
-       PCALIGN $16
-byte:
-       CMP     R9,$0           // process 1 byte
-       BEQ     skip
-       MOVBZ   (R5)(R14),R10
-       MOVBZ   (R6)(R14),R11
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       PCALIGN $16
-skip:
-       BEQ     CR2,equal
-       BGT     CR2,greater
+       PCALIGN $16
+cmp32: // 32 - 63B
+       ANDCC   $31,R9,R9
 
-less:  MOVD    $-1,R3          // return value if A < B
-       RET
-size16:
-       LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
-       LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
        VCMPEQUDCC      V3,V4,V1
        BGE     CR6,different
-zeroremainder:
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-size8:
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R0),R16     // doublewords to compare
-       MOVDBR  (R6+R0),R10     // LE compare order
-#else
-       MOVD    (R5+R0),R16     // doublewords to compare
-       MOVD    (R6+R0),R10     // BE compare order
-#endif
-       CMPU    R16,R10         // match?
-       BGT     greater
-       BLT     less
-       BGT     CR2,greater     // 2nd len > 1st len
-       BLT     CR2,less        // 2nd len < 1st len
-equal:
-       MOVD    $0, R3          // return value if A == B
-       RET
-greater:
-       MOVD    $1,R3           // return value if A > B
-       RET
 
-// Do an efficient memcmp for ppc64le/ppc64/POWER9
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
-TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R8           // set up length
-       CMP     R3,R4,CR2       // unequal?
-       BLT     CR2,setuplen    // BLT CR2
-       MOVD    R4,R8           // use R4 for comparison len
-setuplen:
-       CMP     R8,$16          // optimize for size<16
-       MOVD    R8,R9
-       BLT     simplecheck
-       MOVD    $16,R10         // set offsets to load into vectors
-       CMP     R8,$32          // optimize for size 16-31
-       BLT     cmp16
-       CMP     R8,$64
-       BLT     cmp32           // optimize for size 32-63
-       DCBT    (R5)            // optimize for size>=64
-       DCBT    (R6)            // cache hint
+       LXVD2X  (R10)(R5),V3
+       LXVD2X  (R10)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
-       MOVD    $32,R11         // set offsets to load into vector
-       MOVD    $48,R12         // set offsets to load into vector
+       BC      $12,2,LR        // beqlr
+       ADD     R9,R10,R10
 
-loop64a:// process size 64 and greater
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 0 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 0 into vector
-       VCMPNEBCC       V3,V4,V1        // record comparison into V1
-       BNE     CR6,different   // jump out if its different
+       LXVD2X  (R9)(R5),V3
+       LXVD2X  (R9)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
 
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       LXVD2X  (R10)(R5),V3
+       LXVD2X  (R10)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       RET
 
-       LXVB16X (R11)(R5),V3    // load bytes of A at offset 32 into vector
-       LXVB16X (R11)(R6),V4    // load bytes of B at offset 32 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       PCALIGN $16
+cmp16: // 16 - 31B
+       ANDCC   $15,R9,R9
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       BC      $12,2,LR        // beqlr
 
-       LXVB16X (R12)(R5),V3    // load bytes of A at offset 48 into vector
-       LXVB16X (R12)(R6),V4    // load bytes of B at offset 48 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       LXVD2X  (R9)(R5),V3
+       LXVD2X  (R9)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       RET
 
-       ADD     $-64,R9,R9      // reduce remaining size by 64
-       ADD     $64,R5,R5       // increment to next 64 bytes of A
-       ADD     $64,R6,R6       // increment to next 64 bytes of B
-       CMPU    R9,$64
-       BGE     loop64a         // loop back to loop64a only if there are >= 64 bytes remaining
-
-       CMPU    R9,$32
-       BGE     cmp32           // loop to cmp32 if there are 32-64 bytes remaining
-       CMPU    R9,$16
-       BGE     cmp16           // loop to cmp16 if there are 16-31 bytes left
-       CMPU    R9,$0
-       BNE     simplecheck     // loop to simplecheck for remaining bytes
-
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-cmp32:
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 0 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 0 into vector
-
-       VCMPNEBCC       V3,V4,V1        // record comparison into V1
-       BNE     CR6,different   // jump out if its different
-
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
-
-       ADD     $-32,R9,R9      // reduce remaining size by 32
-       ADD     $32,R5,R5       // increment to next 32 bytes of A
-       ADD     $32,R6,R6       // increment to next 32 bytes of B
-       CMPU    R9,$16          // loop to cmp16 if there are 16-31 bytes left
-       BGE     cmp16
-       CMPU    R9,$0
-       BNE     simplecheck     // loop to simplecheck for remainder bytes
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
+       PCALIGN $16
 different:
+#ifdef GOARCH_ppc64le
+       MOVD    $byteswap<>+00(SB),R16
+       LXVD2X  (R16)(R0),SWAP  // Set up swap string
 
-       MFVSRD  VS35,R16        // move upper doublwords of A and B into GPR for comparison
+       VPERM   V3,V3,SWAP,V3
+       VPERM   V4,V4,SWAP,V4
+#endif
+
+       MFVSRD  VS35,R16        // move upper doublewords of A and B into GPR for comparison
        MFVSRD  VS36,R10
 
        CMPU    R16,R10
        BEQ     lower
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
+       SETB_CR0_NE(R3)
        RET
+
+       PCALIGN $16
 lower:
-       MFVSRLD VS35,R16        // next move lower doublewords of A and B into GPR for comparison
-       MFVSRLD VS36,R10
+       VSLDOI  $8,V3,V3,V3     // move lower doublewords of A and B into GPR for comparison
+       MFVSRD  VS35,R16
+       VSLDOI  $8,V4,V4,V4
+       MFVSRD  VS36,R10
 
        CMPU    R16,R10
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
+       SETB_CR0_NE(R3)
        RET
 
-greater:
-       MOVD    $1,R3           // return value if A > B
+       PCALIGN $16
+cmp8:  // 8 - 15B
+       CMP     R9,$8
+       BLT     cmp4
+       ANDCC   $7,R9,R9
+       _LDBEX  (R0)(R5),R10
+       _LDBEX  (R0)(R6),R11
+       _LDBEX  (R9)(R5),R12
+       _LDBEX  (R9)(R6),R14
+       CMPU    R10,R11,CR0
+       SETB_CR0(R5)
+       CMPU    R12,R14,CR1
+       SETB_CR1(R6)
+       CRAND   CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
+       ISEL    CR0EQ,R6,R5,R4
+       ISEL    CR1EQ,R3,R4,R3
        RET
-cmp16:
-       ANDCC   $16,R9,R31
-       BEQ     tail
-
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 16 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 16 into vector
-       VCMPEQUDCC      V3,V4,V1
-       BGE     CR6,different
-
-       ADD     $16,R5
-       ADD     $16,R6
-tail:
-       ANDCC   $15,R9          // Load the last 16 bytes (we know there are at least 32b)
-       BEQ     end
-
-       ADD     R9,R5
-       ADD     R9,R6
-       MOVD    $-16,R10
 
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPEQUDCC      V3,V4,V1
-       BGE     CR6,different
-end:
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if BLT CR2 that is, len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-simplecheck:
-       MOVD    $0,R14          // process 8 bytes
-       CMP     R9,$8
-       BLT     word
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R14),R10
-       MOVDBR  (R6+R14),R11
-#else
-       MOVD    (R5+R14),R10
-       MOVD    (R6+R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $8,R14
-       ADD     $-8,R9
        PCALIGN $16
-word:
-       CMP     R9,$4           // process 4 bytes
-       BLT     halfword
-#ifdef  GOARCH_ppc64le
-       MOVWBR  (R5+R14),R10
-       MOVWBR  (R6+R14),R11
-#else
-       MOVWZ   (R5+R14),R10
-       MOVWZ   (R6+R14),R11
-#endif
+cmp4:  // 4 - 7B
+       CMP     R9,$4
+       BLT     cmp2
+       ANDCC   $3,R9,R9
+       _LWBEX  (R0)(R5),R10
+       _LWBEX  (R0)(R6),R11
+       _LWBEX  (R9)(R5),R12
+       _LWBEX  (R9)(R6),R14
+       RLDIMI  $32,R10,$0,R12
+       RLDIMI  $32,R11,$0,R14
+       CMPU    R12,R14
+       BR      cmp0
+
+       PCALIGN $16
+cmp2:  // 2 - 3B
+       CMP     R9,$2
+       BLT     cmp1
+       ANDCC   $1,R9,R9
+       _LHBEX  (R0)(R5),R10
+       _LHBEX  (R0)(R6),R11
+       _LHBEX  (R9)(R5),R12
+       _LHBEX  (R9)(R6),R14
+       RLDIMI  $32,R10,$0,R12
+       RLDIMI  $32,R11,$0,R14
+       CMPU    R12,R14
+       BR      cmp0
+
+       PCALIGN $16
+cmp1:
+       CMP     R9,$0
+       BEQ     cmp0
+       MOVBZ   (R5),R10
+       MOVBZ   (R6),R11
        CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $4,R14
-       ADD     $-4,R9
-       PCALIGN $16
-halfword:
-       CMP     R9,$2           // process 2 bytes
-       BLT     byte
-#ifdef  GOARCH_ppc64le
-       MOVHBR  (R5+R14),R10
-       MOVHBR  (R6+R14),R11
-#else
-       MOVHZ   (R5+R14),R10
-       MOVHZ   (R6+R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $2,R14
-       ADD     $-2,R9
-       PCALIGN $16
-byte:
-       CMP     R9,$0           // process 1 byte
-       BEQ     skip
-       MOVBZ   (R5+R14),R10
-       MOVBZ   (R6+R14),R11
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       PCALIGN $16
-skip:
-       BEQ     CR2,equal
-       BGT     CR2,greater
-less:
-       MOVD    $-1,R3          // return value if A < B
-       RET
-equal:
-       MOVD    $0, R3          // return value if A == B
+cmp0:
+       SETB_CR0(R6)
+       ISEL    CR0EQ,R3,R6,R3
        RET