From 0a3296eb96fca43a02b446f64ad5467e7f869eea Mon Sep 17 00:00:00 2001 From: Dusan Kostic Date: Tue, 25 Nov 2025 00:37:17 +0000 Subject: [PATCH 1/2] HOL Light proofs infrastructure for x86 and basemul proof Signed-off-by: Dusan Kostic --- .github/workflows/hol_light.yml | 98 +- BIBLIOGRAPHY.md | 4 +- flake.nix | 4 +- ...05-Configure-hol-sh-for-mlkem-native.patch | 4 +- nix/hol_light/default.nix | 8 +- nix/s2n_bignum/default.nix | 4 +- proofs/hol_light/{arm => }/README.md | 43 +- proofs/hol_light/arm/Makefile | 2 +- proofs/hol_light/arm/proofs/build-proof.sh | 2 +- .../arm/proofs/keccak_f1600_x1_scalar.ml | 4 +- .../arm/proofs/keccak_f1600_x1_v84a.ml | 4 +- .../arm/proofs/keccak_f1600_x2_v84a.ml | 4 +- .../arm/proofs/keccak_f1600_x4_v8a_scalar.ml | 4 +- .../proofs/keccak_f1600_x4_v8a_v84a_scalar.ml | 4 +- proofs/hol_light/arm/proofs/mlkem_intt.ml | 10 +- proofs/hol_light/arm/proofs/mlkem_ntt.ml | 10 +- ...m_poly_basemul_acc_montgomery_cached_k2.ml | 8 +- ...m_poly_basemul_acc_montgomery_cached_k3.ml | 8 +- ...m_poly_basemul_acc_montgomery_cached_k4.ml | 8 +- .../arm/proofs/mlkem_poly_mulcache_compute.ml | 8 +- .../hol_light/arm/proofs/mlkem_poly_reduce.ml | 8 +- .../arm/proofs/mlkem_poly_tobytes.ml | 8 +- .../hol_light/arm/proofs/mlkem_poly_tomont.ml | 8 +- .../hol_light/arm/proofs/mlkem_rej_uniform.ml | 10 +- proofs/hol_light/arm/proofs/mlkem_specs.ml | 470 ---- proofs/hol_light/common/mlkem_specs.ml | 952 ++++++++ proofs/hol_light/x86/Makefile | 136 ++ proofs/hol_light/x86/list_proofs.sh | 10 + ...em_poly_basemul_acc_montgomery_cached_k2.S | 496 ++++ ...em_poly_basemul_acc_montgomery_cached_k3.S | 744 ++++++ ...em_poly_basemul_acc_montgomery_cached_k4.S | 992 ++++++++ proofs/hol_light/x86/proofs/build-proof.sh | 69 + proofs/hol_light/x86/proofs/dump_bytecode.ml | 18 + ...m_poly_basemul_acc_montgomery_cached_k2.ml | 1170 +++++++++ ...m_poly_basemul_acc_montgomery_cached_k3.ml | 1665 +++++++++++++ ...m_poly_basemul_acc_montgomery_cached_k4.ml | 2166 
+++++++++++++++++ scripts/autogen | 83 +- scripts/tests | 23 +- 38 files changed, 8690 insertions(+), 579 deletions(-) rename proofs/hol_light/{arm => }/README.md (70%) delete mode 100644 proofs/hol_light/arm/proofs/mlkem_specs.ml create mode 100644 proofs/hol_light/common/mlkem_specs.ml create mode 100644 proofs/hol_light/x86/Makefile create mode 100755 proofs/hol_light/x86/list_proofs.sh create mode 100644 proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S create mode 100644 proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S create mode 100644 proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S create mode 100755 proofs/hol_light/x86/proofs/build-proof.sh create mode 100644 proofs/hol_light/x86/proofs/dump_bytecode.ml create mode 100644 proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml create mode 100644 proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml create mode 100644 proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml diff --git a/.github/workflows/hol_light.yml b/.github/workflows/hol_light.yml index 2133eadcfc..5d1d4c1b94 100644 --- a/.github/workflows/hol_light.yml +++ b/.github/workflows/hol_light.yml @@ -12,6 +12,9 @@ on: - 'proofs/hol_light/arm/Makefile' - 'proofs/hol_light/arm/**/*.S' - 'proofs/hol_light/arm/**/*.ml' + - 'proofs/hol_light/x86/Makefile' + - 'proofs/hol_light/x86/**/*.S' + - 'proofs/hol_light/x86/**/*.ml' - 'flake.nix' - 'flake.lock' - 'nix/hol_light/*' @@ -23,6 +26,9 @@ on: - 'proofs/hol_light/arm/Makefile' - 'proofs/hol_light/arm/**/*.S' - 'proofs/hol_light/arm/**/*.ml' + - 'proofs/hol_light/x86/Makefile' + - 'proofs/hol_light/x86/**/*.S' + - 'proofs/hol_light/x86/**/*.ml' - 'flake.nix' - 'flake.lock' - 'nix/hol_light/*' @@ -37,7 +43,7 @@ jobs: # but we use this as a fast path to not even start the proofs # if the byte code needs updating. 
hol_light_bytecode: - name: HOL-Light bytecode check + name: AArch64 HOL-Light bytecode check runs-on: pqcp-arm64 if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork steps: @@ -51,7 +57,7 @@ jobs: script: | autogen --update-hol-light-bytecode --dry-run hol_light_interactive: - name: HOL-Light interactive shell test + name: AArch64 HOL-Light interactive shell test runs-on: pqcp-arm64 needs: [ hol_light_bytecode ] if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork @@ -65,7 +71,7 @@ jobs: nix-shell: 'hol_light' script: | make -C proofs/hol_light/arm mlkem/mlkem_poly_tobytes.o - echo 'needs "proofs/mlkem_poly_tobytes.ml";;' | hol.sh + echo 'needs "arm/proofs/mlkem_poly_tobytes.ml";;' | hol.sh hol_light_proofs: needs: [ hol_light_bytecode ] strategy: @@ -103,7 +109,7 @@ jobs: needs: ["keccak_specs.ml"] - name: keccak_f1600_x4_v8a_scalar needs: ["keccak_specs.ml"] - name: HOL Light proof for ${{ matrix.proof.name }}.S + name: AArch64 HOL Light proof for ${{ matrix.proof.name }}.S runs-on: pqcp-arm64 if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork steps: @@ -142,3 +148,87 @@ jobs: nix-shell: 'hol_light' script: | tests hol_light -p ${{ matrix.proof.name }} --verbose + + # x86_64 proofs + hol_light_bytecode_x86_64: + name: x86_64 HOL-Light bytecode check + runs-on: pqcp-x64 + if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + - uses: ./.github/actions/setup-shell + with: + gh_token: ${{ secrets.GITHUB_TOKEN }} + nix-shell: 'hol_light' + script: | + autogen --update-hol-light-bytecode --dry-run + hol_light_interactive_x86_64: + name: x86_64 HOL-Light interactive shell test + runs-on: pqcp-x64 + needs: [ hol_light_bytecode_x86_64 ] + if: github.repository_owner == 'pq-code-package' && 
!github.event.pull_request.head.repo.fork + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + - uses: ./.github/actions/setup-shell + with: + gh_token: ${{ secrets.GITHUB_TOKEN }} + nix-shell: 'hol_light' + script: | + make -C proofs/hol_light/x86 mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o + echo 'needs "x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml";;' | hol.sh + hol_light_proofs_x86_64: + needs: [ hol_light_bytecode_x86_64 ] + strategy: + fail-fast: false + matrix: + proof: + # Dependencies on {name}.{S,ml} are implicit + - name: mlkem_poly_basemul_acc_montgomery_cached_k2 + needs: ["mlkem_specs.ml"] + - name: mlkem_poly_basemul_acc_montgomery_cached_k3 + needs: ["mlkem_specs.ml"] + - name: mlkem_poly_basemul_acc_montgomery_cached_k4 + needs: ["mlkem_specs.ml"] + name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S + runs-on: pqcp-x64 + if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + fetch-depth: 0 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@24d32ffd492484c1d75e0c0b894501ddb9d30d62 # v47.0.0 + - name: Check if dependencies changed + id: check_run + shell: bash + run: | + run_needed=0 + changed_files="${{ steps.changed-files.outputs.all_changed_files }}" + dependencies="${{ join(matrix.proof.needs, ' ') }} ${{ format('{0}.S {0}.ml', matrix.proof.name) }}" + for changed in $changed_files; do + for needs in $dependencies; do + if [[ "$changed" == *"$needs" ]]; then + run_needed=1 + fi + done + done + + # Always re-run upon change to nix files for HOL-Light + if [[ "$changed_files" == *"nix/"* ]] || [[ "$changed_files" == *"hol_light.yml"* ]] || [[ "$changed_files" == *"flake"* ]] || [[ "$changed_files" == *"proofs/hol_light/x86/Makefile"* ]]; then + run_needed=1 + fi + + echo "run_needed=${run_needed}" 
>> $GITHUB_OUTPUT + - uses: ./.github/actions/setup-shell + if: | + steps.check_run.outputs.run_needed == '1' + with: + gh_token: ${{ secrets.GITHUB_TOKEN }} + nix-shell: 'hol_light' + script: | + tests hol_light -p ${{ matrix.proof.name }} --verbose diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 048547aa1b..090fa7b4a3 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -113,7 +113,7 @@ source code and documentation. - [mlkem/src/fips202/native/aarch64/auto.h](mlkem/src/fips202/native/aarch64/auto.h) - [mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S](mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S) - [mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S](mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S) - - [proofs/hol_light/arm/README.md](proofs/hol_light/arm/README.md) + - [proofs/hol_light/README.md](proofs/hol_light/README.md) - [proofs/hol_light/arm/mlkem/keccak_f1600_x1_v84a.S](proofs/hol_light/arm/mlkem/keccak_f1600_x1_v84a.S) - [proofs/hol_light/arm/mlkem/keccak_f1600_x2_v84a.S](proofs/hol_light/arm/mlkem/keccak_f1600_x2_v84a.S) @@ -248,7 +248,7 @@ source code and documentation. 
* URL: https://github.com/slothy-optimizer/slothy/ * Referenced from: - [dev/README.md](dev/README.md) - - [proofs/hol_light/arm/README.md](proofs/hol_light/arm/README.md) + - [proofs/hol_light/README.md](proofs/hol_light/README.md) ### `SLOTHY_Paper` diff --git a/flake.nix b/flake.nix index f7bb2975f8..0b9c73fd5d 100644 --- a/flake.nix +++ b/flake.nix @@ -101,8 +101,8 @@ }).overrideAttrs (old: { shellHook = '' export PATH=$PWD/scripts:$PATH - # Set PROOF_DIR_ARM based on where we entered the shell - export PROOF_DIR_ARM="$PWD/proofs/hol_light/arm" + # Set PROOF_DIR based on where we entered the shell + export PROOF_DIR="$PWD/proofs/hol_light" ''; }); devShells.ci = util.mkShell { diff --git a/nix/hol_light/0005-Configure-hol-sh-for-mlkem-native.patch b/nix/hol_light/0005-Configure-hol-sh-for-mlkem-native.patch index 662af41d64..5b849449cb 100644 --- a/nix/hol_light/0005-Configure-hol-sh-for-mlkem-native.patch +++ b/nix/hol_light/0005-Configure-hol-sh-for-mlkem-native.patch @@ -21,10 +21,10 @@ index 1311255..8b2bc36 100755 +SITELIB=$(dirname $(ocamlfind query findlib 2>/dev/null) 2>/dev/null) + +# Set HOLLIGHT_LOAD_PATH to include S2N_BIGNUM_DIR and mlkem-native proofs -+export HOLLIGHT_LOAD_PATH="${S2N_BIGNUM_DIR}:${PROOF_DIR_ARM}:${HOLLIGHT_LOAD_PATH}" ++export HOLLIGHT_LOAD_PATH="${S2N_BIGNUM_DIR}:${PROOF_DIR}:${HOLLIGHT_LOAD_PATH}" + +# Change to mlkem-native proofs directory if set, so define_from_elf can find object files -+[ -n "${PROOF_DIR_ARM}" ] && cd "${PROOF_DIR_ARM}" ++[ -n "${PROOF_DIR}" ] && cd "${PROOF_DIR}" + +${LINE_EDITOR} ${HOLLIGHT_DIR}/ocaml-hol -init ${HOL_ML_PATH} -I ${HOLLIGHT_DIR} ${SITELIB:+-I "$SITELIB"} diff --git a/hol_4.sh b/hol_4.sh diff --git a/nix/hol_light/default.nix b/nix/hol_light/default.nix index 31f1795b62..5017ca1056 100644 --- a/nix/hol_light/default.nix +++ b/nix/hol_light/default.nix @@ -7,12 +7,12 @@ hol_light.overrideAttrs (old: { export HOLLIGHT_DIR="$1/lib/hol_light" export PATH="$1/lib/hol_light:$PATH" ''; - version 
= "unstable-2025-09-22"; + version = "unstable-2025-11-17"; src = fetchFromGitHub { owner = "jrh13"; repo = "hol-light"; - rev = "bed58fa74649fa74015176f8f90e77f7af5cf8e3"; - hash = "sha256-QDubbUUChvv04239BdcKPSU+E2gdSzqAWfAETK2Xtg0="; + rev = "08bcac75772d37c2447a90c90d1dff9ab415f217"; + hash = "sha256-kYOzGW7uQGOM/b+JPWQfpqqtgMmMku/BkN58WZTtokU="; }; patches = [ ./0005-Configure-hol-sh-for-mlkem-native.patch @@ -20,6 +20,8 @@ hol_light.overrideAttrs (old: { ]; propagatedBuildInputs = old.propagatedBuildInputs ++ old.nativeBuildInputs ++ [ ocamlPackages.pcre2 ledit ]; buildPhase = '' + patchShebangs pa_j/chooser.sh + patchShebangs update_database/chooser.sh HOLLIGHT_USE_MODULE=1 make hol.sh patchShebangs hol.sh HOLLIGHT_USE_MODULE=1 make diff --git a/nix/s2n_bignum/default.nix b/nix/s2n_bignum/default.nix index 67748e542d..d8575ddcdc 100644 --- a/nix/s2n_bignum/default.nix +++ b/nix/s2n_bignum/default.nix @@ -2,12 +2,12 @@ { stdenv, fetchFromGitHub, writeText, ... }: stdenv.mkDerivation rec { pname = "s2n_bignum"; - version = "2ab2252b8505e58a7c3392f8ad823782032b61e7"; + version = "84a604317b94cbca9f14c7b2b771afc2827ab99f"; src = fetchFromGitHub { owner = "awslabs"; repo = "s2n-bignum"; rev = "${version}"; - hash = "sha256-7lil3jAFo5NiyNOSBYZcRjduXkotV3x4PlxXSKt63M8="; + hash = "sha256-J2tVZ2x23ZHP+ZNkbiUqyaci5bu4zNfkXuRemnuB+N0="; }; setupHook = writeText "setup-hook.sh" '' export S2N_BIGNUM_DIR="$1" diff --git a/proofs/hol_light/arm/README.md b/proofs/hol_light/README.md similarity index 70% rename from proofs/hol_light/arm/README.md rename to proofs/hol_light/README.md index 6e4dc1cd34..98f2bf7803 100644 --- a/proofs/hol_light/arm/README.md +++ b/proofs/hol_light/README.md @@ -2,12 +2,12 @@ # HOL Light functional correctness proofs -This directory contains functional correctness proofs for all AArch64 assembly routines -used in mlkem-native. 
The proofs were largely developed by John Harrison +This directory contains functional correctness proofs for all AArch64 and some x86_64 assembly routines +used in mlkem-native. The proofs were largely developed by John Harrison and Dusan Kostic and are written in the [HOL Light](https://hol-light.github.io/) theorem prover, utilizing the assembly verification infrastructure from [s2n-bignum](https://github.com/awslabs/s2n-bignum). -Each function is proved in a separate `.ml` file in [proofs/](proofs). Each file +Each function is proved in a separate `.ml` file in [arm/proofs/](arm/proofs) and [x86/proofs/](x86/proofs). Each file contains the byte code being verified, as well as the specification that is being proved. @@ -16,7 +16,7 @@ proved. Proofs are 'post-hoc' in the sense that HOL-Light/s2n-bignum operate on the final object code. In particular, the means by which the code was generated (including the [SLOTHY](https://github.com/slothy-optimizer/slothy/) superoptimizer) need not be trusted. Specifications are essentially [Hoare triples](https://en.wikipedia.org/wiki/Hoare_logic), with the noteworthy difference that the program is implicit as the content of memory at the PC; which is asserted to -be the code under verification as part of the precondition. For example, the following is the specification of the `poly_reduce` function: +be the code under verification as part of the precondition. For example, the following is the specification of the aarch64 `poly_reduce` function: ```ocaml (* For all (abbreviated by `!` in HOL): @@ -67,6 +67,11 @@ from mlkem-native's base directory. Then ```bash make -C proofs/hol_light/arm ``` +or + +```bash +make -C proofs/hol_light/x86 +``` will build and run the proofs. Note that this make take hours even on powerful machines. @@ -77,23 +82,27 @@ For convenience, you can also use `tests hol_light` which wraps the `make` invoc All AArch64 assembly routines used in mlkem-native are covered. 
Those are: - ML-KEM Arithmetic: - * AArch64 forward NTT: [mlkem_ntt.S](mlkem/mlkem_ntt.S) - * AArch64 inverse NTT: [mlkem_intt.S](mlkem/mlkem_intt.S) - * AArch64 base multiplications: [mlkem_poly_basemul_acc_montgomery_cached_k2.S](mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S) [mlkem_poly_basemul_acc_montgomery_cached_k3.S](mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S) [mlkem_poly_basemul_acc_montgomery_cached_k4.S](mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S) - * AArch64 conversion to Montgomery form: [mlkem_poly_tomont.S](mlkem/mlkem_poly_tomont.S) - * AArch64 modular reduction: [mlkem_poly_reduce.S](mlkem/mlkem_poly_reduce.S) - * AArch64 'multiplication cache' computation: [mlkem_poly_mulcache_compute.S](mlkem/mlkem_poly_mulcache_compute.S) - * AArch64 rejection sampling: [mlkem_rej_uniform.S](mlkem/mlkem_rej_uniform.S) - * AArch64 polynomial compresseion: [mlkem_poly_tobytes.S](mlkem/mlkem_poly_tobytes.S) + * AArch64 forward NTT: [mlkem_ntt.S](arm/mlkem/mlkem_ntt.S) + * AArch64 inverse NTT: [mlkem_intt.S](arm/mlkem/mlkem_intt.S) + * AArch64 base multiplications: [mlkem_poly_basemul_acc_montgomery_cached_k2.S](arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S) [mlkem_poly_basemul_acc_montgomery_cached_k3.S](arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S) [mlkem_poly_basemul_acc_montgomery_cached_k4.S](arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S) + * AArch64 conversion to Montgomery form: [mlkem_poly_tomont.S](arm/mlkem/mlkem_poly_tomont.S) + * AArch64 modular reduction: [mlkem_poly_reduce.S](arm/mlkem/mlkem_poly_reduce.S) + * AArch64 'multiplication cache' computation: [mlkem_poly_mulcache_compute.S](arm/mlkem/mlkem_poly_mulcache_compute.S) + * AArch64 rejection sampling: [mlkem_rej_uniform.S](arm/mlkem/mlkem_rej_uniform.S) + * AArch64 polynomial compression: [mlkem_poly_tobytes.S](arm/mlkem/mlkem_poly_tobytes.S) - FIPS202: - * Keccak-F1600 using lazy rotations[^HYBRID]: 
[keccak_f1600_x1_scalar.S](mlkem/keccak_f1600_x1_scalar.S) - * Keccak-F1600 using v8.4-A SHA3 instructions: [keccak_f1600_x1_v84a.S](mlkem/keccak_f1600_x1_v84a.S) - * 2-fold Keccak-F1600 using v8.4-A SHA3 instructions: [keccak_f1600_x2_v84a.S](mlkem/keccak_f1600_x2_v84a.S) - * 'Hybrid' 4-fold Keccak-F1600 using scalar and v8-A Neon instructions: [keccak_f1600_x4_v8a_scalar.S](mlkem/keccak_f1600_x4_v8a_scalar.S) - * 'Triple hybrid' 4-fold Keccak-F1600 using scalar, v8-A Neon and v8.4-A+SHA3 Neon instructions:[keccak_f1600_x4_v8a_v84a_scalar.S](mlkem/keccak_f1600_x4_v8a_v84a_scalar.S) + * Keccak-F1600 using lazy rotations[^HYBRID]: [keccak_f1600_x1_scalar.S](arm/mlkem/keccak_f1600_x1_scalar.S) + * Keccak-F1600 using v8.4-A SHA3 instructions: [keccak_f1600_x1_v84a.S](arm/mlkem/keccak_f1600_x1_v84a.S) + * 2-fold Keccak-F1600 using v8.4-A SHA3 instructions: [keccak_f1600_x2_v84a.S](arm/mlkem/keccak_f1600_x2_v84a.S) + * 'Hybrid' 4-fold Keccak-F1600 using scalar and v8-A Neon instructions: [keccak_f1600_x4_v8a_scalar.S](arm/mlkem/keccak_f1600_x4_v8a_scalar.S) + * 'Triple hybrid' 4-fold Keccak-F1600 using scalar, v8-A Neon and v8.4-A+SHA3 Neon instructions: [keccak_f1600_x4_v8a_v84a_scalar.S](arm/mlkem/keccak_f1600_x4_v8a_v84a_scalar.S) The NTT and invNTT functions are super-optimized using [SLOTHY](https://github.com/slothy-optimizer/slothy/). 
+The following x86_64 assembly routines used in mlkem-native are covered: +- ML-KEM Arithmetic: + * x86_64 base multiplications: [mlkem_poly_basemul_acc_montgomery_cached_k2.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S) [mlkem_poly_basemul_acc_montgomery_cached_k3.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S) [mlkem_poly_basemul_acc_montgomery_cached_k4.S](x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S) + [^HYBRID]: Becker, Kannwischer: Hybrid scalar/vector implementations of Keccak and SPHINCS+ on AArch64, [https://eprint.iacr.org/2022/1243](https://eprint.iacr.org/2022/1243) [^SLOTHY]: Abdulrahman, Becker, Kannwischer, Klein: SLOTHY superoptimizer, [https://github.com/slothy-optimizer/slothy/](https://github.com/slothy-optimizer/slothy/) diff --git a/proofs/hol_light/arm/Makefile b/proofs/hol_light/arm/Makefile index e43d435921..2c1f2c1fcf 100644 --- a/proofs/hol_light/arm/Makefile +++ b/proofs/hol_light/arm/Makefile @@ -130,7 +130,7 @@ proofs/dump_bytecode.native: proofs/dump_bytecode.ml $(OBJ) # Run them and print the standard output+error at *.correct %.correct: %.native - $< 2>&1 | tee $@ + cd .. ; ./arm/$< 2>&1 | tee ./arm/$@ @if (grep -i "error:\|exception:" "$@" >/dev/null); then \ echo "$< had errors!"; \ exit 1; \ diff --git a/proofs/hol_light/arm/proofs/build-proof.sh b/proofs/hol_light/arm/proofs/build-proof.sh index c2c44eae42..c8735afafb 100755 --- a/proofs/hol_light/arm/proofs/build-proof.sh +++ b/proofs/hol_light/arm/proofs/build-proof.sh @@ -16,7 +16,7 @@ # - Removal of s2n-bignum specific code that is not relevant for # the mlkem-native proofs. 
-ROOT="$(realpath "$(dirname "$0")"/../)" +ROOT="$(realpath "$(dirname "$0")"/../../)" if [ "$#" -ne 3 ]; then echo "${ROOT}/build-proof.sh <.ml file path> " diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml index c7640e2003..4a8dc6a49f 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml @@ -8,13 +8,13 @@ (* ========================================================================= *) needs "arm/proofs/base.ml";; -needs "proofs/keccak_spec.ml";; +needs "arm/proofs/keccak_spec.ml";; (**** print_literal_from_elf "mlkem/keccak_f1600_x1_scalar.o";; ****) let keccak_f1600_x1_scalar_mc = define_assert_from_elf - "keccak_f1600_x1_scalar_mc" "mlkem/keccak_f1600_x1_scalar.o" + "keccak_f1600_x1_scalar_mc" "arm/mlkem/keccak_f1600_x1_scalar.o" (*** BYTECODE START ***) [ 0xd10203ff; (* arm_SUB SP SP (rvalue (word 128)) *) diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x1_v84a.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x1_v84a.ml index 669964d2ef..7aa846a04a 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x1_v84a.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x1_v84a.ml @@ -8,13 +8,13 @@ (* ========================================================================= *) needs "arm/proofs/base.ml";; -needs "proofs/keccak_spec.ml";; +needs "arm/proofs/keccak_spec.ml";; (**** print_literal_from_elf "arm/mlkem/keccak_f1600_x1_v84a.o";; ****) let keccak_f1600_x1_v84a_mc = define_assert_from_elf - "keccak_f1600_x1_v84a_mc" "mlkem/keccak_f1600_x1_v84a.o" + "keccak_f1600_x1_v84a_mc" "arm/mlkem/keccak_f1600_x1_v84a.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x2_v84a.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x2_v84a.ml index 1b45cec507..c367b49736 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x2_v84a.ml +++ 
b/proofs/hol_light/arm/proofs/keccak_f1600_x2_v84a.ml @@ -8,13 +8,13 @@ (* ========================================================================= *) needs "arm/proofs/base.ml";; -needs "proofs/keccak_spec.ml";; +needs "arm/proofs/keccak_spec.ml";; (**** print_literal_from_elf "arm/mlkem/keccak_f1600_x2_v84a.o";; ****) let keccak_f1600_x2_v84a_mc = define_assert_from_elf - "keccak_f1600_x2_v84a_mc" "mlkem/keccak_f1600_x2_v84a.o" + "keccak_f1600_x2_v84a_mc" "arm/mlkem/keccak_f1600_x2_v84a.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml index 3ad9efd061..ecc9819d14 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml @@ -8,13 +8,13 @@ (* ========================================================================= *) needs "arm/proofs/base.ml";; -needs "proofs/keccak_spec.ml";; +needs "arm/proofs/keccak_spec.ml";; (**** print_literal_from_elf "mlkem/keccak_f1600_x4_v8a_scalar.o";; ****) let keccak_f1600_x4_v8a_scalar_mc = define_assert_from_elf - "keccak_f1600_x4_v8a_scalar_mc" "mlkem/keccak_f1600_x4_v8a_scalar.o" + "keccak_f1600_x4_v8a_scalar_mc" "arm/mlkem/keccak_f1600_x4_v8a_scalar.o" (*** BYTECODE START ***) [ 0xd10383ff; (* arm_SUB SP SP (rvalue (word 224)) *) diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml index 42982116ab..ed92f2df63 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml @@ -8,13 +8,13 @@ (* ========================================================================= *) needs "arm/proofs/base.ml";; -needs "proofs/keccak_spec.ml";; +needs "arm/proofs/keccak_spec.ml";; (**** print_literal_from_elf 
"mlkem/keccak_f1600_x4_v8a_v84a_scalar.o";; ****) let keccak_f1600_x4_v8a_v84a_scalar_mc = define_assert_from_elf - "keccak_f1600_x4_v8a_v84a_scalar_mc" "mlkem/keccak_f1600_x4_v8a_v84a_scalar.o" + "keccak_f1600_x4_v8a_v84a_scalar_mc" "arm/mlkem/keccak_f1600_x4_v8a_v84a_scalar.o" (*** BYTECODE START ***) [ 0xd10383ff; (* arm_SUB SP SP (rvalue (word 224)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_intt.ml b/proofs/hol_light/arm/proofs/mlkem_intt.ml index 88228f6331..a34a128004 100644 --- a/proofs/hol_light/arm/proofs/mlkem_intt.ml +++ b/proofs/hol_light/arm/proofs/mlkem_intt.ml @@ -9,15 +9,15 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; -needs "proofs/mlkem_zetas.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; +needs "arm/proofs/mlkem_zetas.ml";; -(**** print_literal_from_elf "mlkem/mlkem_intt.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_intt.o";; ****) let mlkem_intt_mc = define_assert_from_elf - "mlkem_intt_mc" "mlkem/mlkem_intt.o" + "mlkem_intt_mc" "arm/mlkem/mlkem_intt.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_ntt.ml b/proofs/hol_light/arm/proofs/mlkem_ntt.ml index 8998c1a26d..8c5c7a6506 100644 --- a/proofs/hol_light/arm/proofs/mlkem_ntt.ml +++ b/proofs/hol_light/arm/proofs/mlkem_ntt.ml @@ -9,15 +9,15 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; -needs "proofs/mlkem_zetas.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; +needs "arm/proofs/mlkem_zetas.ml";; -(**** print_literal_from_elf "mlkem/mlkem_ntt.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_ntt.o";; ****) let mlkem_ntt_mc = define_assert_from_elf - "mlkem_ntt_mc" "mlkem/mlkem_ntt.o" + "mlkem_ntt_mc" "arm/mlkem/mlkem_ntt.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git 
a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml index a7d981a390..940c0867a8 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml @@ -5,15 +5,15 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; ****) let poly_basemul_acc_montgomery_cached_k2_mc = define_assert_from_elf - "poly_basemul_acc_montgomery_cached_k2_mc" "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o" + "poly_basemul_acc_montgomery_cached_k2_mc" "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml index a781ff6f51..921be72ab3 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml @@ -5,15 +5,15 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; ****) let poly_basemul_acc_montgomery_cached_k3_mc = define_assert_from_elf - "poly_basemul_acc_montgomery_cached_k3_mc" "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o" + 
"poly_basemul_acc_montgomery_cached_k3_mc" "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml index 6418e51ec2..0c342d1618 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml @@ -5,14 +5,14 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; ****) let poly_basemul_acc_montgomery_cached_k4_mc = define_assert_from_elf - "poly_basemul_acc_montgomery_cached_k4_mc" "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o" + "poly_basemul_acc_montgomery_cached_k4_mc" "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o" (*** BYTECODE START ***) [ 0xd10103ff; (* arm_SUB SP SP (rvalue (word 64)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml b/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml index b696e8497d..2d4a4edf48 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml @@ -5,16 +5,16 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; -needs "proofs/mlkem_zetas.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; +needs "arm/proofs/mlkem_zetas.ml";; (**** print_literal_from_elf "mlkem/poly_mulcache_compute.o";; ****) let poly_mulcache_compute_mc = define_assert_from_elf - "poly_mulcache_compute_mc" "mlkem/mlkem_poly_mulcache_compute.o" 
+ "poly_mulcache_compute_mc" "arm/mlkem/mlkem_poly_mulcache_compute.o" (*** BYTECODE START ***) [ 0x5281a025; (* arm_MOV W5 (rvalue (word 3329)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml b/proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml index 81e6dbc255..c43729d44d 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml @@ -9,14 +9,14 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_reduce.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_reduce.o";; ****) let mlkem_poly_reduce_mc = define_assert_from_elf - "mlkem_poly_reduce_mc" "mlkem/mlkem_poly_reduce.o" + "mlkem_poly_reduce_mc" "arm/mlkem/mlkem_poly_reduce.o" (*** BYTECODE START ***) [ 0x5281a022; (* arm_MOV W2 (rvalue (word 3329)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml b/proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml index 110770ae86..289a35cc74 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml @@ -9,14 +9,14 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_tobytes.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_tobytes.o";; ****) let mlkem_poly_tobytes_mc = define_assert_from_elf - "mlkem_poly_tobytes_mc" "mlkem/mlkem_poly_tobytes.o" + "mlkem_poly_tobytes_mc" "arm/mlkem/mlkem_poly_tobytes.o" (*** BYTECODE START ***) [ 0xd2800202; (* arm_MOV X2 (rvalue (word 16)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml b/proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml index e922395757..a3638352b0 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml +++ 
b/proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml @@ -5,14 +5,14 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; -(**** print_literal_from_elf "mlkem/mlkem_poly_tomont.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_poly_tomont.o";; ****) let poly_tomont_asm_mc = define_assert_from_elf - "poly_tomont_asm_mc" "mlkem/mlkem_poly_tomont.o" + "poly_tomont_asm_mc" "arm/mlkem/mlkem_poly_tomont.o" (*** BYTECODE START ***) [ 0x5281a022; (* arm_MOV W2 (rvalue (word 3329)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml b/proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml index 595f54c315..b929cca3dd 100644 --- a/proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml +++ b/proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml @@ -9,15 +9,15 @@ needs "arm/proofs/base.ml";; -needs "proofs/mlkem_specs.ml";; -needs "proofs/mlkem_utils.ml";; -needs "proofs/mlkem_rej_uniform_table.ml";; +needs "common/mlkem_specs.ml";; +needs "arm/proofs/mlkem_utils.ml";; +needs "arm/proofs/mlkem_rej_uniform_table.ml";; -(**** print_literal_from_elf "mlkem/mlkem_rej_uniform.o";; +(**** print_literal_from_elf "arm/mlkem/mlkem_rej_uniform.o";; ****) let mlkem_rej_uniform_mc = define_assert_from_elf - "mlkem_rej_uniform_mc" "mlkem/mlkem_rej_uniform.o" + "mlkem_rej_uniform_mc" "arm/mlkem/mlkem_rej_uniform.o" (*** BYTECODE START ***) [ 0xd10903ff; (* arm_SUB SP SP (rvalue (word 576)) *) diff --git a/proofs/hol_light/arm/proofs/mlkem_specs.ml b/proofs/hol_light/arm/proofs/mlkem_specs.ml deleted file mode 100644 index 6c32dc0407..0000000000 --- a/proofs/hol_light/arm/proofs/mlkem_specs.ml +++ /dev/null @@ -1,470 +0,0 @@ -(* - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - *) - -(* ========================================================================= *) -(* Common specifications and tactics for ML-KEM, mainly related to the NTT - * and the base multiplication. *) -(* ========================================================================= *) - -needs "Library/words.ml";; -needs "Library/isum.ml";; - -(* ------------------------------------------------------------------------- *) -(* The pure forms of forward and inverse NTT with no reordering. *) -(* ------------------------------------------------------------------------- *) - -let pure_forward_ntt = define - `pure_forward_ntt f k = - isum (0..127) (\j. f(2 * j + k MOD 2) * - &17 pow ((2 * k DIV 2 + 1) * j)) - rem &3329`;; - -let pure_inverse_ntt = define - `pure_inverse_ntt f k = - (&3303 * isum (0..127) (\j. f(2 * j + k MOD 2) * - &1175 pow ((2 * j + 1) * k DIV 2))) - rem &3329`;; - -(* ------------------------------------------------------------------------- *) -(* Bit-reversing order as used in the standard/default order. *) -(* ------------------------------------------------------------------------- *) - -let bitreverse7 = define - `bitreverse7(n) = val(word_reversefields 1 (word n:7 word))`;; - -let bitreverse_pairs = define - `bitreverse_pairs i = 2 * bitreverse7 (i DIV 2) + i MOD 2`;; - -let reorder = define - `reorder p (a:num->int) = \i. a(p i)`;; - -(* ------------------------------------------------------------------------- *) -(* Conversion of each element of an array to Montgomery form with B = 2^16. *) -(* ------------------------------------------------------------------------- *) - -let tomont_3329 = define - `tomont_3329 (a:num->int) = \i. 
(&2 pow 16 * a i) rem &3329`;; - -(* ------------------------------------------------------------------------- *) -(* The multiplication cache for fast base multiplication *) -(* ------------------------------------------------------------------------- *) -let mulcache = define - `mulcache f k = - (f (2 * k + 1) * (&17 pow (2 * (bitreverse7 k) + 1))) rem &3329`;; - -(* ------------------------------------------------------------------------- *) -(* The precise specs of the actual ARM code. *) -(* ------------------------------------------------------------------------- *) - -let inverse_ntt = define - `inverse_ntt f k = - (&512 * isum (0..127) - (\j. f(2 * bitreverse7 j + k MOD 2) * - &1175 pow ((2 * j + 1) * k DIV 2))) - rem &3329`;; - -let forward_ntt = define - `forward_ntt f k = - isum (0..127) (\j. f(2 * j + k MOD 2) * - &17 pow ((2 * bitreverse7 (k DIV 2) + 1) * j)) - rem &3329`;; - -(* ------------------------------------------------------------------------- *) -(* Show how these relate to the "pure" ones. 
*) -(* ------------------------------------------------------------------------- *) - -let FORWARD_NTT = prove - (`forward_ntt = reorder bitreverse_pairs o pure_forward_ntt`, - REWRITE_TAC[FUN_EQ_THM; o_DEF; bitreverse_pairs; reorder] THEN - REWRITE_TAC[forward_ntt; pure_forward_ntt] THEN - REWRITE_TAC[ARITH_RULE `(2 * x + i MOD 2) DIV 2 = x`] THEN - REWRITE_TAC[MOD_MULT_ADD; MOD_MOD_REFL]);; - -let INVERSE_NTT = prove - (`inverse_ntt = tomont_3329 o pure_inverse_ntt o reorder bitreverse_pairs`, - REWRITE_TAC[FUN_EQ_THM; o_DEF; bitreverse_pairs; reorder] THEN - REWRITE_TAC[inverse_ntt; pure_inverse_ntt; tomont_3329] THEN - REWRITE_TAC[ARITH_RULE `(2 * x + i MOD 2) DIV 2 = x`] THEN - REWRITE_TAC[MOD_MULT_ADD; MOD_MOD_REFL] THEN - MAP_EVERY X_GEN_TAC [`a:num->int`; `i:num`] THEN - CONV_TAC INT_REM_DOWN_CONV THEN REWRITE_TAC[INT_MUL_ASSOC] THEN - ONCE_REWRITE_TAC[GSYM INT_MUL_REM] THEN CONV_TAC INT_REDUCE_CONV);; - -(* ------------------------------------------------------------------------- *) -(* Explicit computation rules to evaluate mod-3329 powers/sums less naively. *) -(* ------------------------------------------------------------------------- *) - -let BITREVERSE7_CLAUSES = end_itlist CONJ (map - (GEN_REWRITE_CONV I [bitreverse7] THENC DEPTH_CONV WORD_NUM_RED_CONV) - (map (curry mk_comb `bitreverse7` o mk_small_numeral) (0--127)));; - -let FORWARD_NTT_ALT = prove - (`forward_ntt f k = - isum (0..127) - (\j. f(2 * j + k MOD 2) * - (&17 pow ((2 * bitreverse7 (k DIV 2) + 1) * j)) rem &3329) - rem &3329`, - REWRITE_TAC[forward_ntt] THEN MATCH_MP_TAC - (REWRITE_RULE[] (ISPEC - `(\x y. 
x rem &3329 = y rem &3329)` ISUM_RELATED)) THEN - REWRITE_TAC[INT_REM_EQ; FINITE_NUMSEG; INT_CONG_ADD] THEN - X_GEN_TAC `i:num` THEN DISCH_TAC THEN - REWRITE_TAC[GSYM INT_OF_NUM_REM; GSYM INT_OF_NUM_CLAUSES; - GSYM INT_REM_EQ] THEN - CONV_TAC INT_REM_DOWN_CONV THEN - AP_THM_TAC THEN AP_TERM_TAC THEN CONV_TAC INT_ARITH);; - -let INVERSE_NTT_ALT = prove - (`inverse_ntt f k = - isum (0..127) - (\j. f(2 * bitreverse7 j + k MOD 2) * - (&512 * - (&1175 pow ((2 * j + 1) * k DIV 2)) rem &3329) - rem &3329) rem &3329`, - REWRITE_TAC[inverse_ntt; GSYM ISUM_LMUL] THEN MATCH_MP_TAC - (REWRITE_RULE[] (ISPEC - `(\x y. x rem &3329 = y rem &3329)` ISUM_RELATED)) THEN - REWRITE_TAC[INT_REM_EQ; FINITE_NUMSEG; INT_CONG_ADD] THEN - X_GEN_TAC `i:num` THEN DISCH_TAC THEN - REWRITE_TAC[GSYM INT_OF_NUM_REM; GSYM INT_OF_NUM_CLAUSES; - GSYM INT_REM_EQ] THEN - CONV_TAC INT_REM_DOWN_CONV THEN - AP_THM_TAC THEN AP_TERM_TAC THEN CONV_TAC INT_ARITH);; - -let FORWARD_NTT_CONV = - GEN_REWRITE_CONV I [FORWARD_NTT_ALT] THENC - LAND_CONV EXPAND_ISUM_CONV THENC - DEPTH_CONV NUM_RED_CONV THENC - GEN_REWRITE_CONV ONCE_DEPTH_CONV [BITREVERSE7_CLAUSES] THENC - DEPTH_CONV NUM_RED_CONV THENC - GEN_REWRITE_CONV DEPTH_CONV [INT_OF_NUM_POW; INT_OF_NUM_REM] THENC - ONCE_DEPTH_CONV EXP_MOD_CONV THENC INT_REDUCE_CONV;; - -let INVERSE_NTT_CONV = - GEN_REWRITE_CONV I [INVERSE_NTT_ALT] THENC - LAND_CONV EXPAND_ISUM_CONV THENC - DEPTH_CONV NUM_RED_CONV THENC - GEN_REWRITE_CONV ONCE_DEPTH_CONV [BITREVERSE7_CLAUSES] THENC - DEPTH_CONV NUM_RED_CONV THENC - GEN_REWRITE_CONV DEPTH_CONV [INT_OF_NUM_POW; INT_OF_NUM_REM] THENC - ONCE_DEPTH_CONV EXP_MOD_CONV THENC INT_REDUCE_CONV;; - -(* ------------------------------------------------------------------------- *) -(* Abbreviate the Barrett reduction and multiplication and Montgomery *) -(* reduction patterns in the code. 
*) -(* ------------------------------------------------------------------------- *) - -let barred = define - `(barred:int16->int16) x = - word_sub x - (word_mul - (iword - ((ival - (iword_saturate((&2 * ival x * &20159) div &65536):int16) + &1024) div - &2048)) - (word 3329))`;; - -let barmul = define - `barmul (k,b) (a:int16):int16 = - word_sub (word_mul a b) - (word_mul (iword_saturate((&2 * ival a * k + &32768) div &65536)) - (word 3329))`;; - -let montred = define - `(montred:int32->int16) x = - word_subword - (word_add - (word_mul ((word_sx:int16->int32) - (word_mul (word_subword x (0,16)) (word 3327))) - (word 3329)) - x) - (16,16)`;; - -(* ------------------------------------------------------------------------- *) -(* Congruence-and-bound propagation, just recursion on the expression tree. *) -(* ------------------------------------------------------------------------- *) - -let CONGBOUND_ATOM = prove - (`!x:N word. (ival x == ival x) (mod &3329) /\ - --(&2 pow (dimindex(:N) - 1)) <= ival x /\ - ival x <= &2 pow (dimindex(:N) - 1) - &1`, - GEN_TAC THEN REWRITE_TAC[INT_ARITH `x:int <= y - &1 <=> x < y`] THEN - REWRITE_TAC[IVAL_BOUND] THEN INTEGER_TAC);; - -let CONGBOUND_ATOM_GEN = prove - (`!x:N word. abs(ival x) <= n - ==> (ival x == ival x) (mod &3329) /\ --n <= ival x /\ ival x <= n`, - REWRITE_TAC[INTEGER_RULE `(x:int == x) (mod n)`] THEN INT_ARITH_TAC);; - -let CONGBOUND_IWORD = prove - (`!x. ((x == x') (mod &3329) /\ l <= x /\ x <= u) - ==> --(&2 pow (dimindex(:N) - 1)) <= l /\ u <= &2 pow (dimindex(:N) - 1) - &1 - ==> (ival(iword x:N word) == x') (mod &3329) /\ - l <= ival(iword x:N word) /\ ival(iword x:N word) <= u`, - GEN_TAC THEN STRIP_TAC THEN STRIP_TAC THEN REWRITE_TAC[word_sx] THEN - W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o lhand o rand o rand o snd) THEN - ANTS_TAC THENL [ASM_INT_ARITH_TAC; DISCH_THEN SUBST1_TAC] THEN - ASM_REWRITE_TAC[]);; - -let CONGBOUND_WORD_SX = prove - (`!x:M word. 
- ((ival x == x') (mod &3329) /\ l <= ival x /\ ival x <= u) - ==> --(&2 pow (dimindex(:N) - 1)) <= l /\ u <= &2 pow (dimindex(:N) - 1) - &1 - ==> (ival(word_sx x:N word) == x') (mod &3329) /\ - l <= ival(word_sx x:N word) /\ ival(word_sx x:N word) <= u`, - REWRITE_TAC[word_sx; CONGBOUND_IWORD]);; - -let CONGBOUND_WORD_ADD = prove - (`!x y:N word. - ((ival x == x') (mod &3329) /\ lx <= ival x /\ ival x <= ux) /\ - ((ival y == y') (mod &3329) /\ ly <= ival y /\ ival y <= uy) - ==> --(&2 pow (dimindex(:N) - 1)) <= lx + ly /\ - ux + uy <= &2 pow (dimindex(:N) - 1) - &1 - ==> (ival(word_add x y) == x' + y') (mod &3329) /\ - lx + ly <= ival(word_add x y) /\ - ival(word_add x y) <= ux + uy`, - REPEAT GEN_TAC THEN REWRITE_TAC[WORD_ADD_IMODULAR; imodular] THEN - STRIP_TAC THEN STRIP_TAC THEN - MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN - ASM_SIMP_TAC[INT_CONG_ADD] THEN ASM_INT_ARITH_TAC);; - -let CONGBOUND_WORD_SUB = prove - (`!x y:N word. - ((ival x == x') (mod &3329) /\ lx <= ival x /\ ival x <= ux) /\ - ((ival y == y') (mod &3329) /\ ly <= ival y /\ ival y <= uy) - ==> --(&2 pow (dimindex(:N) - 1)) <= lx - uy /\ - ux - ly <= &2 pow (dimindex(:N) - 1) - &1 - ==> (ival(word_sub x y) == x' - y') (mod &3329) /\ - lx - uy <= ival(word_sub x y) /\ - ival(word_sub x y) <= ux - ly`, - REPEAT GEN_TAC THEN REWRITE_TAC[WORD_SUB_IMODULAR; imodular] THEN - STRIP_TAC THEN STRIP_TAC THEN - MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN - ASM_SIMP_TAC[INT_CONG_SUB] THEN ASM_INT_ARITH_TAC);; - -let CONGBOUND_WORD_MUL = prove - (`!x y:N word. 
- ((ival x == x') (mod &3329) /\ lx <= ival x /\ ival x <= ux) /\ - ((ival y == y') (mod &3329) /\ ly <= ival y /\ ival y <= uy) - ==> --(&2 pow (dimindex(:N) - 1)) - <= min (lx * ly) (min (lx * uy) (min (ux * ly) (ux * uy))) /\ - max (lx * ly) (max (lx * uy) (max (ux * ly) (ux * uy))) - <= &2 pow (dimindex(:N) - 1) - &1 - ==> (ival(word_mul x y) == x' * y') (mod &3329) /\ - min (lx * ly) (min (lx * uy) (min (ux * ly) (ux * uy))) - <= ival(word_mul x y) /\ - ival(word_mul x y) - <= max (lx * ly) (max (lx * uy) (max (ux * ly) (ux * uy)))`, - let lemma = prove - (`l:int <= x /\ x <= u - ==> !a. a * l <= a * x /\ a * x <= a * u \/ - a * u <= a * x /\ a * x <= a * l`, - MESON_TAC[INT_LE_NEGTOTAL; INT_LE_LMUL; - INT_ARITH `a * x:int <= a * y <=> --a * y <= --a * x`]) in - REPEAT GEN_TAC THEN - DISCH_THEN(CONJUNCTS_THEN(CONJUNCTS_THEN2 ASSUME_TAC MP_TAC)) THEN - DISCH_THEN(ASSUME_TAC o SPEC `ival(x:N word)` o MATCH_MP lemma) THEN - DISCH_THEN(MP_TAC o MATCH_MP lemma) THEN DISCH_THEN(fun th -> - ASSUME_TAC(SPEC `ly:int` th) THEN ASSUME_TAC(SPEC `uy:int` th)) THEN - REWRITE_TAC[WORD_MUL_IMODULAR; imodular] THEN STRIP_TAC THEN - MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN - ASM_SIMP_TAC[INT_CONG_MUL] THEN ASM_INT_ARITH_TAC);; - -let CONGBOUND_BARRED = prove - (`!a a' l u. 
- ((ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u) - ==> (ival(barred a) == a') (mod &3329) /\ - -- &1664 <= ival(barred a) /\ ival(barred a) <= &1664`, - REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[barred] THEN - REWRITE_TAC[iword_saturate; word_INT_MIN; word_INT_MAX; DIMINDEX_16] THEN - CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN - REPEAT(COND_CASES_TAC THENL - [FIRST_X_ASSUM(MATCH_MP_TAC o MATCH_MP (MESON[] `p ==> ~p ==> q`)) THEN - REWRITE_TAC[INT_GT; INT_NOT_LT] THEN BOUNDER_TAC[]; - ASM_REWRITE_TAC[]]) THEN - REWRITE_TAC[WORD_RULE - `word_sub a (word_mul b (word n)) = iword(ival a - ival b * &n)`] THEN - REPEAT(W(fun (asl,w) -> - let t = hd(sort free_in - (find_terms (can (term_match [] `ival(iword x)`)) w)) in - let th = PART_MATCH (lhand o rand) IVAL_IWORD t in - MP_TAC th THEN REWRITE_TAC[DIMINDEX_16] THEN - CONV_TAC NUM_REDUCE_CONV THEN - ANTS_TAC THENL [BOUNDER_TAC[]; DISCH_THEN SUBST1_TAC])) THEN - MATCH_MP_TAC(MESON[] - `(x == k) (mod n) /\ - (a <= x /\ x <= b) /\ - (a <= x /\ x <= b ==> ival(iword x:int16) = x) - ==> (ival(iword x:int16) == k) (mod n) /\ - a <= ival(iword x:int16) /\ ival(iword x:int16) <= b`) THEN - ASM_REWRITE_TAC[INTEGER_RULE - `(a - x * n:int == a') (mod n) <=> (a == a') (mod n)`] THEN - CONJ_TAC THENL - [MP_TAC(ISPEC `a:int16` IVAL_BOUND); - REPEAT STRIP_TAC THEN MATCH_MP_TAC IVAL_IWORD] THEN - REWRITE_TAC[DIMINDEX_16; ARITH] THEN ASM_INT_ARITH_TAC);; - -let CONGBOUND_BARMUL = prove - (`!a a' l u. - ((ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u) - ==> !k b. 
abs(k) <= &32767 /\ - (max (abs l) (abs u) * - abs(&65536 * ival b - &6658 * k) + &109150207) div &65536 - <= &32767 - ==> (ival(barmul(k,b) a) == a' * ival b) (mod &3329) /\ - --(max (abs l) (abs u) * - abs(&65536 * ival b - &6658 * k) + &109084672) - div &65536 - <= ival(barmul(k,b) a) /\ - ival(barmul(k,b) a) <= - (max (abs l) (abs u) * abs(&65536 * ival b - &6658 * k) + - &109150207) div &65536`, - REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[INT_ABS_BOUNDS] THEN - REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[barmul] THEN - REWRITE_TAC[iword_saturate; word_INT_MIN; word_INT_MAX; DIMINDEX_16] THEN - CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN - REPEAT(COND_CASES_TAC THENL - [FIRST_X_ASSUM(MATCH_MP_TAC o MATCH_MP (MESON[] `p ==> ~p ==> q`)) THEN - REWRITE_TAC[INT_GT; INT_NOT_LT] THEN ASM BOUNDER_TAC[]; - ASM_REWRITE_TAC[]]) THEN - REWRITE_TAC[WORD_RULE - `word_sub (word_mul a b) (word_mul (iword k) (word c)) = - iword(ival a * ival b - &c * k)`] THEN - MATCH_MP_TAC(MESON[] - `(x == k) (mod n) /\ - (a <= x /\ x <= b ==> ival(iword x:int16) = x) /\ - (a <= x /\ x <= b) - ==> (ival(iword x:int16) == k) (mod n) /\ - a <= ival(iword x:int16) /\ ival(iword x:int16) <= b`) THEN - ASM_SIMP_TAC[INTEGER_RULE - `(a:int == a') (mod n) ==> (a * b - n * c == a' * b) (mod n)`] THEN - CONJ_TAC THENL - [REPEAT STRIP_TAC THEN MATCH_MP_TAC IVAL_IWORD THEN - REWRITE_TAC[DIMINDEX_16; ARITH] THEN ASM_INT_ARITH_TAC; - ALL_TAC] THEN - MATCH_MP_TAC(INT_ARITH - `&65536 * l + &109084672 <= a * (&65536 * b - &6658 * k) /\ - a * (&65536 * b - &6658 * k) <= &65536 * u - &109084672 - ==> l <= a * b - &3329 * (&2 * a * k + &32768) div &65536 /\ - a * b - &3329 * (&2 * a * k + &32768) div &65536 <= u`) THEN - CONJ_TAC THENL - [MATCH_MP_TAC(INT_ARITH `abs(y):int <= --x ==> x <= y`); - MATCH_MP_TAC(INT_ARITH `abs(y):int <= x ==> y <= x`)] THEN - REWRITE_TAC[INT_ABS_MUL] THEN - TRANS_TAC INT_LE_TRANS - `max (abs l) (abs u) * abs(&65536 * ival(b:int16) - &6658 * k)` THEN - 
ASM_SIMP_TAC[INT_LE_RMUL; INT_ABS_POS; INT_ARITH - `l:int <= x /\ x <= u ==> abs x <= max (abs l) (abs u)`] THEN - CONV_TAC INT_ARITH);; - -let MONTRED_LEMMA = prove - (`!x. &2 pow 16 * ival(montred x) = - ival(word_add - (word_mul (word_sx(iword(ival x * &3327):int16)) (word 3329)) x)`, - GEN_TAC THEN REWRITE_TAC[montred] THEN REWRITE_TAC[WORD_BLAST - `word_subword (x:int32) (0,16):int16 = word_sx x`] THEN - REWRITE_TAC[IWORD_INT_MUL; GSYM word_sx; GSYM WORD_IWORD] THEN - REWRITE_TAC[WORD_BLAST `(word_sx:int32->int16) x = word_zx x`] THEN - CONV_TAC INT_REDUCE_CONV THEN MATCH_MP_TAC(BITBLAST_RULE - `word_and x (word 65535):int32 = word 0 - ==> &65536 * ival(word_subword x (16,16):int16) = ival x`) THEN - REWRITE_TAC[BITBLAST_RULE - `word_and x (word 65535):int32 = word 0 <=> word_zx x:int16 = word 0`] THEN - W(MP_TAC o PART_MATCH (lhand o rand) WORD_ZX_ADD o lhand o snd) THEN - REWRITE_TAC[DIMINDEX_16; DIMINDEX_32; ARITH] THEN DISCH_THEN SUBST1_TAC THEN - W(MP_TAC o PART_MATCH (lhand o rand) WORD_ZX_MUL o lhand o lhand o snd) THEN - REWRITE_TAC[DIMINDEX_16; DIMINDEX_32; ARITH] THEN DISCH_THEN SUBST1_TAC THEN - REWRITE_TAC[WORD_BLAST `word_zx(word_sx (x:int16):int32) = x`] THEN - REWRITE_TAC[GSYM VAL_EQ_0; VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD] THEN - CONV_TAC MOD_DOWN_CONV THEN REWRITE_TAC[GSYM DIVIDES_MOD; DIMINDEX_16] THEN - CONV_TAC WORD_REDUCE_CONV THEN MATCH_MP_TAC(NUMBER_RULE - `(a * b + 1 == 0) (mod d) ==> d divides ((x * a) * b + x)`) THEN - REWRITE_TAC[CONG] THEN ARITH_TAC);; - -let CONGBOUND_MONTRED = prove - (`!a a' l u. 
- (ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u - ==> --(&2038398976) <= l /\ u <= &2038402304 - ==> (ival(montred a) == &(inverse_mod 3329 65536) * a') (mod &3329) /\ - (l - &109084672) div &2 pow 16 <= ival(montred a) /\ - ival(montred a) <= &1 + (u + &109081343) div &2 pow 16`, - REPEAT GEN_TAC THEN STRIP_TAC THEN STRIP_TAC THEN - CONV_TAC NUM_REDUCE_CONV THEN CONV_TAC(ONCE_DEPTH_CONV INVERSE_MOD_CONV) THEN - MP_TAC(SPECL [`&169:int`; `(&2:int) pow 16`; `&3329:int`] (INTEGER_RULE - `!d e n:int. (e * d == &1) (mod n) - ==> !x y. ((x == d * y) (mod n) <=> (e * x == y) (mod n))`)) THEN - ANTS_TAC THENL - [REWRITE_TAC[GSYM INT_REM_EQ] THEN INT_ARITH_TAC; - DISCH_THEN(fun th -> REWRITE_TAC[th])] THEN - ONCE_REWRITE_TAC[INT_ARITH - `l:int <= x <=> &2 pow 16 * l <= &2 pow 16 * x`] THEN - REWRITE_TAC[MONTRED_LEMMA] THEN - REWRITE_TAC[WORD_RULE - `word_add (word_mul a b) c = iword(ival a * ival b + ival c)`] THEN - ASM_SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH] THEN - W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o - lhand o rator o lhand o snd) THEN - REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN - W(MP_TAC o C ISPEC IVAL_BOUND o - rand o funpow 3 lhand o rand o lhand o lhand o snd) THEN - REWRITE_TAC[DIMINDEX_16; ARITH] THEN STRIP_TAC THEN - ANTS_TAC THENL [ASM_INT_ARITH_TAC; DISCH_THEN SUBST1_TAC] THEN - ASM_REWRITE_TAC[INTEGER_RULE - `(a * p + x:int == y) (mod p) <=> (x == y) (mod p)`] THEN - ASM_INT_ARITH_TAC);; - -let DIMINDEX_INT_REDUCE_CONV = - DEPTH_CONV(WORD_NUM_RED_CONV ORELSEC DIMINDEX_CONV) THENC - INT_REDUCE_CONV;; - -let CONCL_BOUNDS_RULE = - CONV_RULE(BINOP2_CONV - (LAND_CONV(RAND_CONV DIMINDEX_INT_REDUCE_CONV)) - (BINOP2_CONV - (LAND_CONV DIMINDEX_INT_REDUCE_CONV) - (RAND_CONV DIMINDEX_INT_REDUCE_CONV)));; - -let SIDE_ELIM_RULE th = - MP th (EQT_ELIM(DIMINDEX_INT_REDUCE_CONV(lhand(concl th))));; - -let rec GEN_CONGBOUND_RULE aboths tm = - match tm with - Comb(Comb(Const("barmul",_),kb),t) -> - let 
ktm,btm = dest_pair kb and th0 = GEN_CONGBOUND_RULE aboths t in - let th1 = SPECL [ktm;btm] (MATCH_MP CONGBOUND_BARMUL th0) in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) - | Comb(Const("barred",_),t) -> - let th1 = GEN_CONGBOUND_RULE aboths t in - MATCH_MP CONGBOUND_BARRED th1 - | Comb(Const("montred",_),t) -> - let th1 = GEN_CONGBOUND_RULE aboths t in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE(MATCH_MP CONGBOUND_MONTRED th1)) - | Comb(Const("word_sx",_),t) -> - let th0 = GEN_CONGBOUND_RULE aboths t in - let tyin = type_match - (type_of(rator(rand(lhand(funpow 4 rand (snd(dest_forall - (concl CONGBOUND_WORD_SX)))))))) (type_of(rator tm)) [] in - let th1 = MATCH_MP (INST_TYPE tyin CONGBOUND_WORD_SX) th0 in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) - | Comb(Comb(Const("word_add",_),ltm),rtm) -> - let lth = GEN_CONGBOUND_RULE aboths ltm - and rth = GEN_CONGBOUND_RULE aboths rtm in - let th1 = MATCH_MP CONGBOUND_WORD_ADD (CONJ lth rth) in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) - | Comb(Comb(Const("word_sub",_),ltm),rtm) -> - let lth = GEN_CONGBOUND_RULE aboths ltm - and rth = GEN_CONGBOUND_RULE aboths rtm in - let th1 = MATCH_MP CONGBOUND_WORD_SUB (CONJ lth rth) in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) - | Comb(Comb(Const("word_mul",_),ltm),rtm) -> - let lth = GEN_CONGBOUND_RULE aboths ltm - and rth = GEN_CONGBOUND_RULE aboths rtm in - let th1 = MATCH_MP CONGBOUND_WORD_MUL (CONJ lth rth) in - CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) - | _ -> (try MATCH_MP CONGBOUND_ATOM_GEN - (find ((=) tm o rand o rand o lhand o concl) aboths) - with Failure _ -> CONCL_BOUNDS_RULE(ISPEC tm CONGBOUND_ATOM));; - -let CONGBOUND_RULE = GEN_CONGBOUND_RULE [];; diff --git a/proofs/hol_light/common/mlkem_specs.ml b/proofs/hol_light/common/mlkem_specs.ml new file mode 100644 index 0000000000..15bd2c7ef4 --- /dev/null +++ b/proofs/hol_light/common/mlkem_specs.ml @@ -0,0 +1,952 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* This file is a trimmed down version of s2n-bignum's mlkem_mldsa.ml file. *) +(* Common specifications and tactics for ML-KEM x86 and arm proofs. *) +(* ========================================================================= *) + +needs "Library/words.ml";; +needs "Library/isum.ml";; + +(* ------------------------------------------------------------------------- *) +(* The pure forms of forward and inverse NTT with no reordering. *) +(* ------------------------------------------------------------------------- *) + +let pure_forward_ntt = define + `pure_forward_ntt f k = + isum (0..127) (\j. f(2 * j + k MOD 2) * + &17 pow ((2 * k DIV 2 + 1) * j)) + rem &3329`;; + +let pure_inverse_ntt = define + `pure_inverse_ntt f k = + (&3303 * isum (0..127) (\j. f(2 * j + k MOD 2) * + &1175 pow ((2 * j + 1) * k DIV 2))) + rem &3329`;; + +(* ------------------------------------------------------------------------- *) +(* Bit-reversing order as used in the standard/default order. *) +(* ------------------------------------------------------------------------- *) + +let bitreverse7 = define + `bitreverse7(n) = val(word_reversefields 1 (word n:7 word))`;; + +let bitreverse_pairs = define + `bitreverse_pairs i = 2 * bitreverse7 (i DIV 2) + i MOD 2`;; + +let reorder = define + `reorder p (a:num->int) = \i. a(p i)`;; + +let avx2_ntt_order = define + `avx2_ntt_order i = + bitreverse7(64 * (i DIV 64) + ((i MOD 64) DIV 16) + 4 * (i MOD 16))`;; + +(* ------------------------------------------------------------------------- *) +(* Conversion of each element of an array to Montgomery form with B = 2^16. *) +(* ------------------------------------------------------------------------- *) + +let tomont_3329 = define + `tomont_3329 (a:num->int) = \i. (&2 pow 16 * a i) rem &3329`;; + +let tomont_8380417 = define + `tomont_8380417 (a:num->int) = \i. 
(&2 pow 32 * a i) rem &8380417`;; + +(* ------------------------------------------------------------------------- *) +(* The multiplication cache for fast base multiplication *) +(* ------------------------------------------------------------------------- *) + +let mulcache = define + `mulcache f k = + (f (2 * k + 1) * (&17 pow (2 * (bitreverse7 k) + 1))) rem &3329`;; + +(* ------------------------------------------------------------------------- *) +(* The precise specs of the actual ARM code. *) +(* ------------------------------------------------------------------------- *) + +let inverse_ntt = define + `inverse_ntt f k = + (&512 * isum (0..127) + (\j. f(2 * bitreverse7 j + k MOD 2) * + &1175 pow ((2 * j + 1) * k DIV 2))) + rem &3329`;; + +let forward_ntt = define + `forward_ntt f k = + isum (0..127) (\j. f(2 * j + k MOD 2) * + &17 pow ((2 * bitreverse7 (k DIV 2) + 1) * j)) + rem &3329`;; + +(* ------------------------------------------------------------------------- *) +(* The precise specs of the actual x86 code. *) +(* ------------------------------------------------------------------------- *) + +let avx2_forward_ntt = define + `avx2_forward_ntt f k = + let r = (k DIV 16) MOD 2 + and q = 16 * (k DIV 32) + k MOD 16 in + isum (0..127) (\j. f(2 * j + r) * + &17 pow ((2 * avx2_ntt_order q + 1) * j)) + rem &3329`;; + +(* ------------------------------------------------------------------------- *) +(* Show how these relate to the "pure" ones. 
*) +(* ------------------------------------------------------------------------- *) + +let FORWARD_NTT = prove + (`forward_ntt = reorder bitreverse_pairs o pure_forward_ntt`, + REWRITE_TAC[FUN_EQ_THM; o_DEF; bitreverse_pairs; reorder] THEN + REWRITE_TAC[forward_ntt; pure_forward_ntt] THEN + REWRITE_TAC[ARITH_RULE `(2 * x + i MOD 2) DIV 2 = x`] THEN + REWRITE_TAC[MOD_MULT_ADD; MOD_MOD_REFL]);; + +let INVERSE_NTT = prove + (`inverse_ntt = tomont_3329 o pure_inverse_ntt o reorder bitreverse_pairs`, + REWRITE_TAC[FUN_EQ_THM; o_DEF; bitreverse_pairs; reorder] THEN + REWRITE_TAC[inverse_ntt; pure_inverse_ntt; tomont_3329] THEN + REWRITE_TAC[ARITH_RULE `(2 * x + i MOD 2) DIV 2 = x`] THEN + REWRITE_TAC[MOD_MULT_ADD; MOD_MOD_REFL] THEN + MAP_EVERY X_GEN_TAC [`a:num->int`; `i:num`] THEN + CONV_TAC INT_REM_DOWN_CONV THEN REWRITE_TAC[INT_MUL_ASSOC] THEN + ONCE_REWRITE_TAC[GSYM INT_MUL_REM] THEN CONV_TAC INT_REDUCE_CONV);; + +(* ------------------------------------------------------------------------- *) +(* Explicit computation rules to evaluate mod-3329 powers/sums less naively. *) +(* ------------------------------------------------------------------------- *) + +let BITREVERSE7_CLAUSES = end_itlist CONJ (map + (GEN_REWRITE_CONV I [bitreverse7] THENC DEPTH_CONV WORD_NUM_RED_CONV) + (map (curry mk_comb `bitreverse7` o mk_small_numeral) (0--127)));; + +let FORWARD_NTT_ALT = prove + (`forward_ntt f k = + isum (0..127) + (\j. f(2 * j + k MOD 2) * + (&17 pow ((2 * bitreverse7 (k DIV 2) + 1) * j)) rem &3329) + rem &3329`, + REWRITE_TAC[forward_ntt] THEN MATCH_MP_TAC + (REWRITE_RULE[] (ISPEC + `(\x y. 
x rem &3329 = y rem &3329)` ISUM_RELATED)) THEN + REWRITE_TAC[INT_REM_EQ; FINITE_NUMSEG; INT_CONG_ADD] THEN + X_GEN_TAC `i:num` THEN DISCH_TAC THEN + REWRITE_TAC[GSYM INT_OF_NUM_REM; GSYM INT_OF_NUM_CLAUSES; + GSYM INT_REM_EQ] THEN + CONV_TAC INT_REM_DOWN_CONV THEN + AP_THM_TAC THEN AP_TERM_TAC THEN CONV_TAC INT_ARITH);; + +let AVX2_FORWARD_NTT_ALT = prove + (`avx2_forward_ntt f k = + let r = (k DIV 16) MOD 2 + and q = 16 * (k DIV 32) + k MOD 16 in + isum (0..127) + (\j. f(2 * j + r) * + (&17 pow ((2 * avx2_ntt_order q + 1) * j)) rem &3329) + rem &3329`, + REWRITE_TAC[avx2_forward_ntt] THEN + CONV_TAC(TOP_DEPTH_CONV let_CONV) THEN MATCH_MP_TAC + (REWRITE_RULE[] (ISPEC + `(\x y. x rem &3329 = y rem &3329)` ISUM_RELATED)) THEN + REWRITE_TAC[INT_REM_EQ; FINITE_NUMSEG; INT_CONG_ADD] THEN + X_GEN_TAC `i:num` THEN DISCH_TAC THEN + REWRITE_TAC[GSYM INT_OF_NUM_REM; GSYM INT_OF_NUM_CLAUSES; + GSYM INT_REM_EQ] THEN + CONV_TAC INT_REM_DOWN_CONV THEN + AP_THM_TAC THEN AP_TERM_TAC THEN CONV_TAC INT_ARITH);; + +let INVERSE_NTT_ALT = prove + (`inverse_ntt f k = + isum (0..127) + (\j. f(2 * bitreverse7 j + k MOD 2) * + (&512 * + (&1175 pow ((2 * j + 1) * k DIV 2)) rem &3329) + rem &3329) rem &3329`, + REWRITE_TAC[inverse_ntt; GSYM ISUM_LMUL] THEN MATCH_MP_TAC + (REWRITE_RULE[] (ISPEC + `(\x y. 
x rem &3329 = y rem &3329)` ISUM_RELATED)) THEN + REWRITE_TAC[INT_REM_EQ; FINITE_NUMSEG; INT_CONG_ADD] THEN + X_GEN_TAC `i:num` THEN DISCH_TAC THEN + REWRITE_TAC[GSYM INT_OF_NUM_REM; GSYM INT_OF_NUM_CLAUSES; + GSYM INT_REM_EQ] THEN + CONV_TAC INT_REM_DOWN_CONV THEN + AP_THM_TAC THEN AP_TERM_TAC THEN CONV_TAC INT_ARITH);; + +let FORWARD_NTT_CONV = + GEN_REWRITE_CONV I [FORWARD_NTT_ALT] THENC + LAND_CONV EXPAND_ISUM_CONV THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV ONCE_DEPTH_CONV [BITREVERSE7_CLAUSES] THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV DEPTH_CONV [INT_OF_NUM_POW; INT_OF_NUM_REM] THENC + ONCE_DEPTH_CONV EXP_MOD_CONV THENC INT_REDUCE_CONV;; + +let AVX2_NTT_ORDER_CLAUSES = end_itlist CONJ (map + (GEN_REWRITE_CONV I [avx2_ntt_order] THENC DEPTH_CONV WORD_NUM_RED_CONV THENC + GEN_REWRITE_CONV I [BITREVERSE7_CLAUSES]) + (map (curry mk_comb `avx2_ntt_order` o mk_small_numeral) (0--127)));; + +let AVX2_FORWARD_NTT_CONV = + GEN_REWRITE_CONV I [AVX2_FORWARD_NTT_ALT] THENC + NUM_REDUCE_CONV THENC ONCE_DEPTH_CONV let_CONV THENC + LAND_CONV EXPAND_ISUM_CONV THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV ONCE_DEPTH_CONV [AVX2_NTT_ORDER_CLAUSES] THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV DEPTH_CONV [INT_OF_NUM_POW; INT_OF_NUM_REM] THENC + ONCE_DEPTH_CONV EXP_MOD_CONV THENC INT_REDUCE_CONV;; + +let INVERSE_NTT_CONV = + GEN_REWRITE_CONV I [INVERSE_NTT_ALT] THENC + LAND_CONV EXPAND_ISUM_CONV THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV ONCE_DEPTH_CONV [BITREVERSE7_CLAUSES] THENC + DEPTH_CONV NUM_RED_CONV THENC + GEN_REWRITE_CONV DEPTH_CONV [INT_OF_NUM_POW; INT_OF_NUM_REM] THENC + ONCE_DEPTH_CONV EXP_MOD_CONV THENC INT_REDUCE_CONV;; + +(* ------------------------------------------------------------------------- *) +(* Abbreviate the Barrett reduction and multiplication and Montgomery *) +(* reduction patterns in the code. 
*) +(* ------------------------------------------------------------------------- *) + +let barred = define + `(barred:int16->int16) x = + word_sub x + (word_mul + (iword + ((ival + (iword_saturate((&2 * ival x * &20159) div &65536):int16) + &1024) div + &2048)) + (word 3329))`;; + +let barred_x86 = define + `(barred_x86:int16->int16) x = + word_sub + (x) + (word_mul + (word_ishr + (word_subword + (word_mul ((word_sx x):int32) (word 20159)) + (16,16)) + (10)) + (word 3329))`;; + +let barmul = define + `barmul (k,b) (a:int16):int16 = + word_sub (word_mul a b) + (word_mul (iword_saturate((&2 * ival a * k + &32768) div &65536)) + (word 3329))`;; + +let montred = define + `(montred:int32->int16) x = + word_subword + (word_add + (word_mul ((word_sx:int16->int32) + (word_mul (word_subword x (0,16)) (word 3327))) + (word 3329)) + x) + (16,16)`;; + +(* ------------------------------------------------------------------------- *) +(* For the x86 version of ML-KEM *) +(* ------------------------------------------------------------------------- *) + +let montmul_x86 = define + `montmul_x86 (x : int16) (y :int16) = + word_sub + (word_subword (word_mul (word_sx y : int32) (word_sx x)) (16,16) : int16) + (word_subword + (word_mul (word 3329) (word_sx (word_mul y (word_mul (word 62209) x)) : int32)) + (16,16)) + `;; + +let montmul_odd_x86 = prove + (`word_neg(montmul_x86 x y) = + word_sub + (word_subword + (word_mul (word 3329) (word_sx (word_mul y (word_mul (word 62209) x)) : int32)) + (16,16)) + (word_subword (word_mul (word_sx y : int32) (word_sx x)) (16,16) : int16)`, + REWRITE_TAC[montmul_x86] THEN CONV_TAC WORD_RULE);; + +let ntt_montmul = define + `ntt_montmul (a:int32, b:int16) (x:int16) = + word_sub + (word_subword (word_mul (word_sx (x:int16)) a:int32) (16,16):int16) + (word_subword + (word_mul (word_sx + ((word_mul (x:int16) b:int16))) + (word 3329:int32)) + (16,16))`;; + +let ntt_montmul_add = prove + (`word_add y (ntt_montmul (a, b) x) = + word_sub + (word_add + (y) + 
(word_subword (word_mul (word_sx (x:int16)) a:int32) (16,16):int16)) + (word_subword + (word_mul (word_sx + ((word_mul (x:int16) b:int16))) + (word 3329:int32)) + (16,16))`, + REWRITE_TAC[ntt_montmul] THEN CONV_TAC WORD_RULE);; + +let ntt_montmul_sub = prove + (`word_sub y (ntt_montmul (a, b) x) = + word_add + (word_sub + (y) + (word_subword (word_mul (word_sx (x:int16)) a:int32) (16,16):int16)) + (word_subword + (word_mul (word_sx + ((word_mul (x:int16) b:int16))) + (word 3329:int32)) + (16,16))`, + REWRITE_TAC[ntt_montmul] THEN CONV_TAC WORD_RULE);; + +(* ------------------------------------------------------------------------- *) +(* From |- (x == y) (mod m) /\ P to |- (x == y) (mod n) /\ P *) +(* ------------------------------------------------------------------------- *) + +let WEAKEN_INTCONG_RULE = + let prule = (MATCH_MP o prove) + (`(x:int == y) (mod m) ==> !n. m rem n = &0 ==> (x == y) (mod n)`, + REWRITE_TAC[INT_REM_EQ_0] THEN INTEGER_TAC) + and conv = GEN_REWRITE_CONV I [INT_REM_ZERO; INT_REM_REFL] ORELSEC + INT_REM_CONV + and zth = REFL `&0:int` in + let lrule n th = + let th1 = SPEC (mk_intconst n) (prule th) in + let th2 = LAND_CONV conv (lhand(concl th1)) in + MP th1 (EQ_MP (SYM th2) zth) in + fun n th -> + let th1,th2 = CONJ_PAIR th in + CONJ (lrule n th1) th2;; + +(* ------------------------------------------------------------------------- *) +(* Unify modulus and conjoin a pair of (x == y) (mod m) /\ P theorems. *) +(* ------------------------------------------------------------------------- *) + +let UNIFY_INTCONG_RULE th1 th2 = + let p1 = dest_intconst (rand(rand(lhand(concl th1)))) + and p2 = dest_intconst (rand(rand(lhand(concl th2)))) in + let d = gcd_num p1 p2 in + CONJ (WEAKEN_INTCONG_RULE d th1) (WEAKEN_INTCONG_RULE d th2);; + +(* ------------------------------------------------------------------------- *) +(* Process list of inequalities into standard congbounds for atomic terms. 
*) +(* ------------------------------------------------------------------------- *) + +let DIMINDEX_INT_REDUCE_CONV = + DEPTH_CONV(WORD_NUM_RED_CONV ORELSEC DIMINDEX_CONV) THENC + INT_REDUCE_CONV;; + +let PROCESS_BOUND_ASSUMPTIONS = + let cth = prove + (`(ival x <= b <=> + --(&2 pow (dimindex(:N) - 1)) <= ival x /\ ival x <= b) /\ + (b <= ival x <=> + b <= ival x /\ ival x <= &2 pow (dimindex(:N) - 1) - &1) /\ + (ival(x:N word) > b <=> + b + &1 <= ival x /\ ival x <= &2 pow (dimindex(:N) - 1) - &1) /\ + (b > ival(x:N word) <=> + --(&2 pow (dimindex(:N) - 1)) <= ival x /\ ival x <= b - &1) /\ + (ival(x:N word) >= b <=> + b <= ival x /\ ival x <= &2 pow (dimindex(:N) - 1) - &1) /\ + (b >= ival(x:N word) <=> + --(&2 pow (dimindex(:N) - 1)) <= ival x /\ ival x <= b) /\ + (ival(x:N word) < b <=> + --(&2 pow (dimindex(:N) - 1)) <= ival x /\ ival x <= b - &1) /\ + (b < ival(x:N word) <=> + b + &1 <= ival x /\ ival x <= &2 pow (dimindex(:N) - 1) - &1) /\ + (abs(ival(x:N word)) <= b <=> + --b <= ival x /\ ival x <= b) /\ + (abs(ival(x:N word)) < b <=> + &1 - b <= ival x /\ ival x <= b - &1)`, + REWRITE_TAC[IVAL_BOUND; INT_ARITH `x:int <= y - &1 <=> x < y`] THEN + INT_ARITH_TAC) + and pth = prove + (`!l u (x:N word). 
+ l <= ival x /\ ival x <= u + ==> (ival x == ival x) (mod &0) /\ l <= ival x /\ ival x <= u`, + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[] THEN INTEGER_TAC) in + let rule = + MATCH_MP pth o + CONV_RULE (BINOP2_CONV (LAND_CONV DIMINDEX_INT_REDUCE_CONV) + (RAND_CONV DIMINDEX_INT_REDUCE_CONV)) o + GEN_REWRITE_RULE I [cth] in + let rec process lfn ths = + match ths with + [] -> lfn + | th::oths -> + let lfn' = + try let th' = rule th in + let tm = rand(concl th') in + if is_intconst (rand(rand tm)) && is_intconst (lhand(lhand tm)) + then (rand(lhand(rand tm)) |-> th') lfn + else lfn + with Failure _ -> lfn in + process lfn' oths in + process undefined;; + +(* ------------------------------------------------------------------------- *) +(* Congruence-and-bound propagation, just recursion on the expression tree. *) +(* ------------------------------------------------------------------------- *) + +let CONGBOUND_CONST = prove + (`!(x:N word) n. + ival x = n + ==> (ival x == n) (mod &0) /\ n <= ival x /\ ival x <= n`, + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[INT_LE_REFL] THEN INTEGER_TAC);; + +let CONGBOUND_ATOM = prove + (`!x:N word. (ival x == ival x) (mod &0) /\ + --(&2 pow (dimindex(:N) - 1)) <= ival x /\ + ival x <= &2 pow (dimindex(:N) - 1) - &1`, + GEN_TAC THEN REWRITE_TAC[INT_ARITH `x:int <= y - &1 <=> x < y`] THEN + REWRITE_TAC[IVAL_BOUND] THEN INTEGER_TAC);; + +let CONGBOUND_ATOM_GEN = prove + (`!x:N word. abs(ival x) <= n + ==> (ival x == ival x) (mod &0) /\ + --n <= ival x /\ ival x <= n`, + REWRITE_TAC[INTEGER_RULE `(x:int == x) (mod n)`] THEN INT_ARITH_TAC);; + +let CONGBOUND_IWORD = prove + (`!x. 
((x == x') (mod p) /\ l <= x /\ x <= u) + ==> --(&2 pow (dimindex(:N) - 1)) <= l /\ + u <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(iword x:N word) == x') (mod p) /\ + l <= ival(iword x:N word) /\ ival(iword x:N word) <= u`, + GEN_TAC THEN STRIP_TAC THEN STRIP_TAC THEN REWRITE_TAC[word_sx] THEN + W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o + lhand o rand o rand o snd) THEN + ANTS_TAC THENL [ASM_INT_ARITH_TAC; DISCH_THEN SUBST1_TAC] THEN + ASM_REWRITE_TAC[]);; + +let CONGBOUND_WORD_SX = prove + (`!x:M word. + ((ival x == x') (mod p) /\ l <= ival x /\ ival x <= u) + ==> --(&2 pow (dimindex(:N) - 1)) <= l /\ + u <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(word_sx x:N word) == x') (mod p) /\ + l <= ival(word_sx x:N word) /\ ival(word_sx x:N word) <= u`, + REWRITE_TAC[word_sx; CONGBOUND_IWORD]);; + +let CONGBOUND_WORD_NEG = prove + (`!x:N word. + ((ival x == x') (mod p) /\ lx <= ival x /\ ival x <= ux) + ==> --lx <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(word_neg x) == --x') (mod p) /\ + --ux <= ival(word_neg x) /\ + ival(word_neg x) <= --lx`, + GEN_TAC THEN STRIP_TAC THEN STRIP_TAC THEN + SUBGOAL_THEN `ival(word_neg x:N word) = --(ival x)` SUBST1_TAC THENL + [REPEAT(POP_ASSUM MP_TAC) THEN WORD_ARITH_TAC; + ASM_SIMP_TAC[INTEGER_RULE + `(x:int == x') (mod p) ==> (--x == --x') (mod p)`] THEN + ASM_ARITH_TAC]);; + +let CONGBOUND_WORD_ADD = prove + (`!x y:N word. + ((ival x == x') (mod p) /\ lx <= ival x /\ ival x <= ux) /\ + ((ival y == y') (mod p) /\ ly <= ival y /\ ival y <= uy) + ==> --(&2 pow (dimindex(:N) - 1)) <= lx + ly /\ + ux + uy <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(word_add x y) == x' + y') (mod p) /\ + lx + ly <= ival(word_add x y) /\ + ival(word_add x y) <= ux + uy`, + REPEAT GEN_TAC THEN REWRITE_TAC[WORD_ADD_IMODULAR; imodular] THEN + STRIP_TAC THEN STRIP_TAC THEN + MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN + ASM_SIMP_TAC[INT_CONG_ADD] THEN ASM_INT_ARITH_TAC);; + +let CONGBOUND_WORD_SUB = prove + (`!x y:N word. 
+ ((ival x == x') (mod p) /\ lx <= ival x /\ ival x <= ux) /\ + ((ival y == y') (mod p) /\ ly <= ival y /\ ival y <= uy) + ==> --(&2 pow (dimindex(:N) - 1)) <= lx - uy /\ + ux - ly <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(word_sub x y) == x' - y') (mod p) /\ + lx - uy <= ival(word_sub x y) /\ + ival(word_sub x y) <= ux - ly`, + REPEAT GEN_TAC THEN REWRITE_TAC[WORD_SUB_IMODULAR; imodular] THEN + STRIP_TAC THEN STRIP_TAC THEN + MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN + ASM_SIMP_TAC[INT_CONG_SUB] THEN ASM_INT_ARITH_TAC);; + +let CONGBOUND_WORD_MUL = prove + (`!x y:N word. + ((ival x == x') (mod p) /\ lx <= ival x /\ ival x <= ux) /\ + ((ival y == y') (mod p) /\ ly <= ival y /\ ival y <= uy) + ==> --(&2 pow (dimindex(:N) - 1)) + <= min (lx * ly) (min (lx * uy) (min (ux * ly) (ux * uy))) /\ + max (lx * ly) (max (lx * uy) (max (ux * ly) (ux * uy))) + <= &2 pow (dimindex(:N) - 1) - &1 + ==> (ival(word_mul x y) == x' * y') (mod p) /\ + min (lx * ly) (min (lx * uy) (min (ux * ly) (ux * uy))) + <= ival(word_mul x y) /\ + ival(word_mul x y) + <= max (lx * ly) (max (lx * uy) (max (ux * ly) (ux * uy)))`, + let lemma = prove + (`l:int <= x /\ x <= u + ==> !a. a * l <= a * x /\ a * x <= a * u \/ + a * u <= a * x /\ a * x <= a * l`, + MESON_TAC[INT_LE_NEGTOTAL; INT_LE_LMUL; + INT_ARITH `a * x:int <= a * y <=> --a * y <= --a * x`]) in + REPEAT GEN_TAC THEN + DISCH_THEN(CONJUNCTS_THEN(CONJUNCTS_THEN2 ASSUME_TAC MP_TAC)) THEN + DISCH_THEN(ASSUME_TAC o SPEC `ival(x:N word)` o MATCH_MP lemma) THEN + DISCH_THEN(MP_TAC o MATCH_MP lemma) THEN DISCH_THEN(fun th -> + ASSUME_TAC(SPEC `ly:int` th) THEN ASSUME_TAC(SPEC `uy:int` th)) THEN + REWRITE_TAC[WORD_MUL_IMODULAR; imodular] THEN STRIP_TAC THEN + MATCH_MP_TAC(REWRITE_RULE[IMP_IMP] CONGBOUND_IWORD) THEN + ASM_SIMP_TAC[INT_CONG_MUL] THEN ASM_INT_ARITH_TAC);; + +let CONGBOUND_BARRED = prove + (`!a a' l u. 
+ ((ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u) + ==> (ival(barred a) == a') (mod &3329) /\ + -- &1664 <= ival(barred a) /\ ival(barred a) <= &1664`, + REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[barred] THEN + REWRITE_TAC[iword_saturate; word_INT_MIN; word_INT_MAX; DIMINDEX_16] THEN + CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN + REPEAT(COND_CASES_TAC THENL + [FIRST_X_ASSUM(MATCH_MP_TAC o MATCH_MP (MESON[] `p ==> ~p ==> q`)) THEN + REWRITE_TAC[INT_GT; INT_NOT_LT] THEN BOUNDER_TAC[]; + ASM_REWRITE_TAC[]]) THEN + REWRITE_TAC[WORD_RULE + `word_sub a (word_mul b (word n)) = iword(ival a - ival b * &n)`] THEN + REPEAT(W(fun (asl,w) -> + let t = hd(sort free_in + (find_terms (can (term_match [] `ival(iword x)`)) w)) in + let th = PART_MATCH (lhand o rand) IVAL_IWORD t in + MP_TAC th THEN REWRITE_TAC[DIMINDEX_16] THEN + CONV_TAC NUM_REDUCE_CONV THEN + ANTS_TAC THENL [BOUNDER_TAC[]; DISCH_THEN SUBST1_TAC])) THEN + MATCH_MP_TAC(MESON[] + `(x == k) (mod n) /\ + (a <= x /\ x <= b) /\ + (a <= x /\ x <= b ==> ival(iword x:int16) = x) + ==> (ival(iword x:int16) == k) (mod n) /\ + a <= ival(iword x:int16) /\ ival(iword x:int16) <= b`) THEN + ASM_REWRITE_TAC[INTEGER_RULE + `(a - x * n:int == a') (mod n) <=> (a == a') (mod n)`] THEN + CONJ_TAC THENL + [MP_TAC(ISPEC `a:int16` IVAL_BOUND); + REPEAT STRIP_TAC THEN MATCH_MP_TAC IVAL_IWORD] THEN + REWRITE_TAC[DIMINDEX_16; ARITH] THEN ASM_INT_ARITH_TAC);; + +let CONGBOUND_BARRED_X86 = prove + (`!a a' l u. 
+ ((ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u) + ==> (ival(barred_x86 a) == a') (mod &3329) /\ + &0 <= ival(barred_x86 a) /\ ival(barred_x86 a) < &6658`, + REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[barred_x86] THEN + REWRITE_TAC[WORD_BLAST + `word_ishr (word_subword (x:int32) (16,16):int16) 10 = + word_sx(word_ishr x 26)`] THEN + REWRITE_TAC[WORD_RULE + `word_sub a (word_mul b (word n)) = iword(ival a - ival b * &n)`] THEN + REWRITE_TAC[BITBLAST_RULE + `ival(word_sx(word_ishr (x:int32) 26):int16) = ival(word_ishr x 26)`] THEN + REWRITE_TAC[WORD_MUL_IMODULAR; imodular; IVAL_WORD_ISHR] THEN + SIMP_TAC[IVAL_WORD_SX; DIMINDEX_32; DIMINDEX_16; ARITH] THEN + CONV_TAC WORD_REDUCE_CONV THEN + SUBGOAL_THEN + `ival(iword(ival(a:int16) * &20159):int32) = ival a * &20159` + SUBST1_TAC THENL + [MATCH_MP_TAC IVAL_IWORD THEN REWRITE_TAC[DIMINDEX_32] THEN BOUNDER_TAC[]; + ALL_TAC] THEN + W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o + lhand o rator o lhand o snd) THEN + ANTS_TAC THENL + [MP_TAC(ISPEC `a:int16` IVAL_BOUND) THEN REWRITE_TAC[DIMINDEX_16] THEN + CONV_TAC NUM_REDUCE_CONV THEN INT_ARITH_TAC; + DISCH_THEN SUBST1_TAC] THEN + ASM_REWRITE_TAC[INTEGER_RULE + `(a - x * p:int == a') (mod p) <=> (a == a') (mod p)`] THEN + MP_TAC(ISPEC `a:int16` IVAL_BOUND) THEN REWRITE_TAC[DIMINDEX_16] THEN + CONV_TAC NUM_REDUCE_CONV THEN INT_ARITH_TAC + );; + +let CONGBOUND_BARMUL = prove + (`!a a' l u. + ((ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u) + ==> !k b. 
abs(k) <= &32767 /\ + (max (abs l) (abs u) * + abs(&65536 * ival b - &6658 * k) + &109150207) div &65536 + <= &32767 + ==> (ival(barmul(k,b) a) == a' * ival b) (mod &3329) /\ + --(max (abs l) (abs u) * + abs(&65536 * ival b - &6658 * k) + &109084672) + div &65536 + <= ival(barmul(k,b) a) /\ + ival(barmul(k,b) a) <= + (max (abs l) (abs u) * abs(&65536 * ival b - &6658 * k) + + &109150207) div &65536`, + REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[INT_ABS_BOUNDS] THEN + REPEAT GEN_TAC THEN STRIP_TAC THEN REWRITE_TAC[barmul] THEN + REWRITE_TAC[iword_saturate; word_INT_MIN; word_INT_MAX; DIMINDEX_16] THEN + CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN + REPEAT(COND_CASES_TAC THENL + [FIRST_X_ASSUM(MATCH_MP_TAC o MATCH_MP (MESON[] `p ==> ~p ==> q`)) THEN + REWRITE_TAC[INT_GT; INT_NOT_LT] THEN ASM BOUNDER_TAC[]; + ASM_REWRITE_TAC[]]) THEN + REWRITE_TAC[WORD_RULE + `word_sub (word_mul a b) (word_mul (iword k) (word c)) = + iword(ival a * ival b - &c * k)`] THEN + MATCH_MP_TAC(MESON[] + `(x == k) (mod n) /\ + (a <= x /\ x <= b ==> ival(iword x:int16) = x) /\ + (a <= x /\ x <= b) + ==> (ival(iword x:int16) == k) (mod n) /\ + a <= ival(iword x:int16) /\ ival(iword x:int16) <= b`) THEN + ASM_SIMP_TAC[INTEGER_RULE + `(a:int == a') (mod n) ==> (a * b - n * c == a' * b) (mod n)`] THEN + CONJ_TAC THENL + [REPEAT STRIP_TAC THEN MATCH_MP_TAC IVAL_IWORD THEN + REWRITE_TAC[DIMINDEX_16; ARITH] THEN ASM_INT_ARITH_TAC; + ALL_TAC] THEN + MATCH_MP_TAC(INT_ARITH + `&65536 * l + &109084672 <= a * (&65536 * b - &6658 * k) /\ + a * (&65536 * b - &6658 * k) <= &65536 * u - &109084672 + ==> l <= a * b - &3329 * (&2 * a * k + &32768) div &65536 /\ + a * b - &3329 * (&2 * a * k + &32768) div &65536 <= u`) THEN + CONJ_TAC THENL + [MATCH_MP_TAC(INT_ARITH `abs(y):int <= --x ==> x <= y`); + MATCH_MP_TAC(INT_ARITH `abs(y):int <= x ==> y <= x`)] THEN + REWRITE_TAC[INT_ABS_MUL] THEN + TRANS_TAC INT_LE_TRANS + `max (abs l) (abs u) * abs(&65536 * ival(b:int16) - &6658 * k)` THEN + 
ASM_SIMP_TAC[INT_LE_RMUL; INT_ABS_POS; INT_ARITH + `l:int <= x /\ x <= u ==> abs x <= max (abs l) (abs u)`] THEN + CONV_TAC INT_ARITH);; + +let CONGBOUND_MONTMUL_X86 = prove + (`!x y. ((ival x == x') (mod &3329) /\ lx <= ival x /\ ival x <= ux) /\ + ((ival y == y') (mod &3329) /\ ly <= ival y /\ ival y <= uy) + ==> (ival(montmul_x86 x y) == + &(inverse_mod 3329 65536) * x' * y') (mod &3329) /\ + (min (lx * ly) (min (lx * uy) (min (ux * ly) (ux * uy))) - + &109081343) div &65536 <= ival(montmul_x86 x y) /\ + ival(montmul_x86 x y) + <= (max (lx * ly) (max (lx * uy) (max (ux * ly) (ux * uy))) + + &109150207) div &65536`, + let lemma = prove + (`l:int <= x /\ x <= u + ==> !a. a * l <= a * x /\ a * x <= a * u \/ + a * u <= a * x /\ a * x <= a * l`, + MESON_TAC[INT_LE_NEGTOTAL; INT_LE_LMUL; + INT_ARITH `a * x:int <= a * y <=> --a * y <= --a * x`]) + and ilemma = prove + (`!x:int32. ival(word_subword x (16,16):int16) = ival x div &2 pow 16`, + REWRITE_TAC[GSYM DIMINDEX_16; GSYM IVAL_WORD_ISHR] THEN + GEN_TAC THEN REWRITE_TAC[DIMINDEX_16] THEN BITBLAST_TAC) in + let mainlemma = prove + (`!x:int32 y:int32. 
+ (ival x == ival y) (mod (&2 pow 16)) + ==> &2 pow 16 * + ival(word_sub (word_subword x (16,16)) + (word_subword y (16,16)):int16) = + ival(word_sub x y)`, + REPEAT STRIP_TAC THEN MATCH_MP_TAC(INT_ARITH + `b rem &2 pow 16 = &0 /\ a = &2 pow 16 * b div &2 pow 16 ==> a = b`) THEN + CONJ_TAC THENL + [REWRITE_TAC[WORD_SUB_IMODULAR; imodular; INT_REM_EQ_0] THEN + SIMP_TAC[INT_DIVIDES_IVAL_IWORD; DIMINDEX_32; ARITH] THEN + POP_ASSUM MP_TAC THEN CONV_TAC INTEGER_RULE; + AP_TERM_TAC THEN REWRITE_TAC[GSYM ilemma] THEN AP_TERM_TAC] THEN + FIRST_X_ASSUM(MP_TAC o GEN_REWRITE_RULE I [GSYM INT_REM_EQ]) THEN + SIMP_TAC[INT_REM_IVAL; DIMINDEX_16; DIMINDEX_32; ARITH] THEN + BITBLAST_TAC) in + REPEAT GEN_TAC THEN DISCH_TAC THEN + CONV_TAC NUM_REDUCE_CONV THEN CONV_TAC(ONCE_DEPTH_CONV INVERSE_MOD_CONV) THEN + MP_TAC(SPECL [`&169:int`; `(&2:int) pow 16`; `&3329:int`] (INTEGER_RULE + `!d e n:int. (e * d == &1) (mod n) + ==> !x y. ((x == d * y) (mod n) <=> (e * x == y) (mod n))`)) THEN + ANTS_TAC THENL + [REWRITE_TAC[GSYM INT_REM_EQ] THEN INT_ARITH_TAC; + DISCH_THEN(fun th -> REWRITE_TAC[th])] THEN + ONCE_REWRITE_TAC[INT_ARITH + `l:int <= x <=> &2 pow 16 * l <= &2 pow 16 * x`] THEN + REWRITE_TAC[montmul_x86] THEN + REWRITE_TAC[WORD_MUL_IMODULAR; imodular] THEN + SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH] THEN + CONV_TAC WORD_REDUCE_CONV THEN + REWRITE_TAC[WORD_RULE + `!x:int16 y:int16. 
+ iword(ival y * ival(iword(c * ival x):int16)):int16 = + iword(c * ival x * ival y)`] THEN + W(MP_TAC o PART_MATCH (lhand o rand) mainlemma o + lhand o rator o lhand o snd) THEN + ANTS_TAC THENL + [SIMP_TAC[GSYM INT_REM_EQ; INT_REM_IVAL_IWORD; DIMINDEX_32; ARITH] THEN + ONCE_REWRITE_TAC[GSYM INT_MUL_REM] THEN + SIMP_TAC[INT_REM_IVAL_IWORD; DIMINDEX_16; ARITH; DIMINDEX_32] THEN + REWRITE_TAC[GSYM INT_REM_EQ] THEN CONV_TAC INT_REM_DOWN_CONV THEN + REWRITE_TAC[INT_REM_EQ] THEN MATCH_MP_TAC(INTEGER_RULE + `(a * b:int == &1) (mod p) ==> (y * x == a * b * x * y) (mod p)`) THEN + REWRITE_TAC[GSYM INT_REM_EQ] THEN INT_ARITH_TAC; + DISCH_THEN SUBST1_TAC THEN REWRITE_TAC[GSYM IWORD_INT_SUB]] THEN + W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o + lhand o rator o lhand o snd) THEN + ANTS_TAC THENL + [REWRITE_TAC[DIMINDEX_32; ARITH] THEN BOUNDER_TAC[]; + DISCH_THEN SUBST1_TAC] THEN + ONCE_REWRITE_TAC[INT_ARITH `ival x * ival y = ival y * ival x`] THEN + ASM_SIMP_TAC[INTEGER_RULE + `(x:int == x') (mod p) /\ (y == y') (mod p) + ==> (x * y - p * z == x' * y') (mod p)`] THEN + MATCH_MP_TAC(INT_ARITH + `(l <= p /\ p <= u) /\ (&65535 - c <= q /\ q <= b) + ==> &2 pow 16 * (l - b) div &65536 <= p - q /\ + p - q <= &2 pow 16 * (u + c) div &65536`) THEN + CONJ_TAC THENL [ALL_TAC; BOUNDER_TAC[]] THEN + FIRST_X_ASSUM(CONJUNCTS_THEN(MP_TAC o CONJUNCT2)) THEN + DISCH_THEN(ASSUME_TAC o SPEC `ival(x:int16)` o MATCH_MP lemma) THEN + DISCH_THEN(MP_TAC o MATCH_MP lemma) THEN DISCH_THEN(fun th -> + ASSUME_TAC(SPEC `ly:int` th) THEN ASSUME_TAC(SPEC `uy:int` th)) THEN + ASM_INT_ARITH_TAC);; + +let MONTRED_LEMMA = prove + (`!x. 
&2 pow 16 * ival(montred x) = + ival(word_add + (word_mul (word_sx(iword(ival x * &3327):int16)) (word 3329)) x)`, + GEN_TAC THEN REWRITE_TAC[montred] THEN REWRITE_TAC[WORD_BLAST + `word_subword (x:int32) (0,16):int16 = word_sx x`] THEN + REWRITE_TAC[IWORD_INT_MUL; GSYM word_sx; GSYM WORD_IWORD] THEN + REWRITE_TAC[WORD_BLAST `(word_sx:int32->int16) x = word_zx x`] THEN + CONV_TAC INT_REDUCE_CONV THEN MATCH_MP_TAC(BITBLAST_RULE + `word_and x (word 65535):int32 = word 0 + ==> &65536 * ival(word_subword x (16,16):int16) = ival x`) THEN + REWRITE_TAC[BITBLAST_RULE + `word_and x (word 65535):int32 = word 0 <=> word_zx x:int16 = word 0`] THEN + W(MP_TAC o PART_MATCH (lhand o rand) WORD_ZX_ADD o lhand o snd) THEN + REWRITE_TAC[DIMINDEX_16; DIMINDEX_32; ARITH] THEN DISCH_THEN SUBST1_TAC THEN + W(MP_TAC o PART_MATCH (lhand o rand) WORD_ZX_MUL o lhand o lhand o snd) THEN + REWRITE_TAC[DIMINDEX_16; DIMINDEX_32; ARITH] THEN DISCH_THEN SUBST1_TAC THEN + REWRITE_TAC[WORD_BLAST `word_zx(word_sx (x:int16):int32) = x`] THEN + REWRITE_TAC[GSYM VAL_EQ_0; VAL_WORD_ADD; VAL_WORD_MUL; VAL_WORD] THEN + CONV_TAC MOD_DOWN_CONV THEN REWRITE_TAC[GSYM DIVIDES_MOD; DIMINDEX_16] THEN + CONV_TAC WORD_REDUCE_CONV THEN MATCH_MP_TAC(NUMBER_RULE + `(a * b + 1 == 0) (mod d) ==> d divides ((x * a) * b + x)`) THEN + REWRITE_TAC[CONG] THEN ARITH_TAC);; + +let CONGBOUND_MONTRED = prove + (`!a a' l u. + (ival a == a') (mod &3329) /\ l <= ival a /\ ival a <= u + ==> --(&2038398976) <= l /\ u <= &2038402304 + ==> (ival(montred a) == &(inverse_mod 3329 65536) * a') (mod &3329) /\ + (l - &109084672) div &2 pow 16 <= ival(montred a) /\ + ival(montred a) <= &1 + (u + &109081343) div &2 pow 16`, + REPEAT GEN_TAC THEN STRIP_TAC THEN STRIP_TAC THEN + CONV_TAC NUM_REDUCE_CONV THEN CONV_TAC(ONCE_DEPTH_CONV INVERSE_MOD_CONV) THEN + MP_TAC(SPECL [`&169:int`; `(&2:int) pow 16`; `&3329:int`] (INTEGER_RULE + `!d e n:int. (e * d == &1) (mod n) + ==> !x y. 
((x == d * y) (mod n) <=> (e * x == y) (mod n))`)) THEN + ANTS_TAC THENL + [REWRITE_TAC[GSYM INT_REM_EQ] THEN INT_ARITH_TAC; + DISCH_THEN(fun th -> REWRITE_TAC[th])] THEN + ONCE_REWRITE_TAC[INT_ARITH + `l:int <= x <=> &2 pow 16 * l <= &2 pow 16 * x`] THEN + REWRITE_TAC[MONTRED_LEMMA] THEN + REWRITE_TAC[WORD_RULE + `word_add (word_mul a b) c = iword(ival a * ival b + ival c)`] THEN + ASM_SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH] THEN + W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o + lhand o rator o lhand o snd) THEN + REWRITE_TAC[DIMINDEX_32] THEN CONV_TAC(DEPTH_CONV WORD_NUM_RED_CONV) THEN + W(MP_TAC o C ISPEC IVAL_BOUND o + rand o funpow 3 lhand o rand o lhand o lhand o snd) THEN + REWRITE_TAC[DIMINDEX_16; ARITH] THEN STRIP_TAC THEN + ANTS_TAC THENL [ASM_INT_ARITH_TAC; DISCH_THEN SUBST1_TAC] THEN + ASM_REWRITE_TAC[INTEGER_RULE + `(a * p + x:int == y) (mod p) <=> (x == y) (mod p)`] THEN + ASM_INT_ARITH_TAC);; + +let CONGBOUND_NTT_MONTMUL = prove + (`!x x' lx ux. + ((ival x == x') (mod &3329) /\ lx <= ival x /\ ival x <= ux) + ==> !a b. --(&32768) <= ival a /\ + ival a <= &32767 /\ + (&3329 * ival b) rem &65536 = ival a rem &65536 + ==> (ival(ntt_montmul (a,b) x) == + &(inverse_mod 3329 65536) * ival a * x') + (mod &3329) /\ + (min (ival a * lx) (ival a * ux) - &109081343) + div &65536 <= ival(ntt_montmul (a,b) x) /\ + ival(ntt_montmul (a,b) x) <= + (max (ival a * lx) (ival a * ux) + &109150208) + div &2 pow 16`, + let lemma = prove + (`l:int <= x /\ x <= u + ==> !a. a * l <= a * x /\ a * x <= a * u \/ + a * u <= a * x /\ a * x <= a * l`, + MESON_TAC[INT_LE_NEGTOTAL; INT_LE_LMUL; + INT_ARITH `a * x:int <= a * y <=> --a * y <= --a * x`]) + and ilemma = prove + (`!x:int32. ival(word_subword x (16,16):int16) = ival x div &2 pow 16`, + REWRITE_TAC[GSYM DIMINDEX_16; GSYM IVAL_WORD_ISHR] THEN + GEN_TAC THEN REWRITE_TAC[DIMINDEX_16] THEN BITBLAST_TAC) in + let mainlemma = prove + (`!x:int32 y:int32. 
+ (ival x == ival y) (mod (&2 pow 16)) + ==> &2 pow 16 * + ival(word_sub (word_subword x (16,16)) + (word_subword y (16,16)):int16) = + ival(word_sub x y)`, + REPEAT STRIP_TAC THEN MATCH_MP_TAC(INT_ARITH + `b rem &2 pow 16 = &0 /\ a = &2 pow 16 * b div &2 pow 16 ==> a = b`) THEN + CONJ_TAC THENL + [REWRITE_TAC[WORD_SUB_IMODULAR; imodular; INT_REM_EQ_0] THEN + SIMP_TAC[INT_DIVIDES_IVAL_IWORD; DIMINDEX_32; ARITH] THEN + POP_ASSUM MP_TAC THEN CONV_TAC INTEGER_RULE; + AP_TERM_TAC THEN REWRITE_TAC[GSYM ilemma] THEN AP_TERM_TAC] THEN + FIRST_X_ASSUM(MP_TAC o GEN_REWRITE_RULE I [GSYM INT_REM_EQ]) THEN + SIMP_TAC[INT_REM_IVAL; DIMINDEX_16; DIMINDEX_32; ARITH] THEN + BITBLAST_TAC) in + REPEAT GEN_TAC THEN DISCH_TAC THEN REPEAT GEN_TAC THEN STRIP_TAC THEN + CONV_TAC NUM_REDUCE_CONV THEN CONV_TAC(ONCE_DEPTH_CONV INVERSE_MOD_CONV) THEN + MP_TAC(SPECL [`&169:int`; `(&2:int) pow 16`; `&3329:int`] (INTEGER_RULE + `!d e n:int. (e * d == &1) (mod n) + ==> !x y. ((x == d * y) (mod n) <=> (e * x == y) (mod n))`)) THEN + ANTS_TAC THENL + [REWRITE_TAC[GSYM INT_REM_EQ] THEN INT_ARITH_TAC; + DISCH_THEN(fun th -> REWRITE_TAC[th])] THEN + ONCE_REWRITE_TAC[INT_ARITH + `l:int <= x <=> &2 pow 16 * l <= &2 pow 16 * x`] THEN + REWRITE_TAC[ntt_montmul] THEN + REWRITE_TAC[WORD_MUL_IMODULAR; imodular] THEN + SIMP_TAC[IVAL_WORD_SX; DIMINDEX_32; DIMINDEX_32; ARITH] THEN + CONV_TAC WORD_REDUCE_CONV THEN + W(MP_TAC o PART_MATCH (lhand o rand) mainlemma o + lhand o rator o lhand o snd) THEN + ANTS_TAC THENL + [SIMP_TAC[GSYM INT_REM_EQ; INT_REM_IVAL_IWORD; DIMINDEX_32; ARITH] THEN + SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH_LE; ARITH_LT] THEN + ONCE_REWRITE_TAC[GSYM INT_MUL_REM] THEN + REWRITE_TAC[REWRITE_RULE[GSYM INT_REM_EQ] IVAL_IWORD_CONG; + GSYM DIMINDEX_16] THEN + REWRITE_TAC[DIMINDEX_16] THEN CONV_TAC INT_REM_DOWN_CONV THEN + REWRITE_TAC[GSYM INT_MUL_ASSOC] THEN + ONCE_REWRITE_TAC[GSYM INT_MUL_REM] THEN + AP_THM_TAC THEN AP_TERM_TAC THEN AP_TERM_TAC THEN + CONV_TAC INT_REDUCE_CONV 
THEN ASM_REWRITE_TAC[INT_MUL_SYM]; + DISCH_THEN SUBST1_TAC THEN REWRITE_TAC[GSYM IWORD_INT_SUB]] THEN + W(MP_TAC o PART_MATCH (lhand o rand) IVAL_IWORD o + lhand o rator o lhand o snd) THEN + ANTS_TAC THENL + [SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH_LE; ARITH_LT] THEN + REWRITE_TAC[DIMINDEX_32; ARITH] THEN ASM BOUNDER_TAC[]; + DISCH_THEN SUBST1_TAC] THEN + SIMP_TAC[IVAL_WORD_SX; DIMINDEX_16; DIMINDEX_32; ARITH_LE; ARITH_LT] THEN + ASM_SIMP_TAC[INTEGER_RULE + `(x:int == x') (mod p) ==> (x * a - q * p == a * x') (mod p)`] THEN + REWRITE_TAC[GSYM(INT_REDUCE_CONV `(&2:int) pow 16`)] THEN + MATCH_MP_TAC(INT_ARITH + `(l <= p /\ p <= u) /\ (&65535 - c <= q /\ q <= b) + ==> &2 pow 16 * (l - b) div &2 pow 16 <= p - q /\ + p - q <= &2 pow 16 * (u + c) div &2 pow 16`) THEN + CONJ_TAC THENL [ALL_TAC; BOUNDER_TAC[]] THEN + FIRST_X_ASSUM(MP_TAC o SPEC `ival(a:int32)` o + MATCH_MP lemma o CONJUNCT2) THEN + INT_ARITH_TAC);; + +let CONCL_BOUNDS_RULE = + CONV_RULE(BINOP2_CONV + (LAND_CONV(RAND_CONV DIMINDEX_INT_REDUCE_CONV)) + (BINOP2_CONV + (LAND_CONV DIMINDEX_INT_REDUCE_CONV) + (RAND_CONV DIMINDEX_INT_REDUCE_CONV)));; + +let SIDE_ELIM_RULE th = + MP th (EQT_ELIM(DIMINDEX_INT_REDUCE_CONV(lhand(concl th))));; + +let GEN_CONGBOUND_RULE aboths = + let lfn = PROCESS_BOUND_ASSUMPTIONS aboths in + let rec rule tm = + try apply lfn tm with Failure _ -> + match tm with + Comb(Const("word",_),n) when is_numeral n -> + let th1 = ISPEC tm CONGBOUND_CONST in + let th2 = WORD_RED_CONV(lhand(lhand(snd(strip_forall(concl th1))))) in + MATCH_MP th1 th2 + | Comb(Const("iword",_),n) when is_intconst n -> + let th0 = WORD_IWORD_CONV tm in + let th1 = ISPEC (rand(concl th0)) CONGBOUND_CONST in + let th2 = WORD_RED_CONV(lhand(lhand(snd(strip_forall(concl th1))))) in + SUBS[SYM th0] (MATCH_MP th1 th2) + | Comb(Comb(Const("barmul",_),kb),t) -> + let ktm,btm = dest_pair kb and th0 = rule t in + let th0' = WEAKEN_INTCONG_RULE (num 3329) th0 in + let th1 = SPECL [ktm;btm] (MATCH_MP 
CONGBOUND_BARMUL th0') in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Comb(Const("montmul_x86",_),ltm),rtm) -> + let lth = WEAKEN_INTCONG_RULE (num 3329) (rule ltm) + and rth = WEAKEN_INTCONG_RULE (num 3329) (rule rtm) in + let th1 = MATCH_MP CONGBOUND_MONTMUL_X86 + (UNIFY_INTCONG_RULE lth rth) in + CONCL_BOUNDS_RULE(th1) + | Comb(Const("barred",_),t) -> + let th1 = WEAKEN_INTCONG_RULE (num 3329) (rule t) in + MATCH_MP CONGBOUND_BARRED th1 + | Comb(Const("barred_x86",_),t) -> + let th1 = WEAKEN_INTCONG_RULE (num 3329) (rule t) in + MATCH_MP CONGBOUND_BARRED_X86 th1 + | Comb(Const("montred",_),t) -> + let th1 = WEAKEN_INTCONG_RULE (num 3329) (rule t) in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE(MATCH_MP CONGBOUND_MONTRED th1)) + | Comb(Comb(Const("ntt_montmul",_),ab),t) -> + let atm,btm = dest_pair ab and th0 = rule t in + let th0' = WEAKEN_INTCONG_RULE (num 3329) th0 in + let th1 = SPECL [atm;btm] (MATCH_MP CONGBOUND_NTT_MONTMUL th0') in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Const("word_sx",_),t) -> + let th0 = rule t in + let tyin = type_match + (type_of(rator(rand(lhand(funpow 4 rand (snd(dest_forall + (concl CONGBOUND_WORD_SX)))))))) (type_of(rator tm)) [] in + let th1 = MATCH_MP (INST_TYPE tyin CONGBOUND_WORD_SX) th0 in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Const("word_neg",_),t) -> + let th0 = rule t in + let th1 = MATCH_MP CONGBOUND_WORD_NEG th0 in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Comb(Const("word_add",_),ltm),rtm) -> + let lth = rule ltm and rth = rule rtm in + let th1 = MATCH_MP CONGBOUND_WORD_ADD (UNIFY_INTCONG_RULE lth rth) in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Comb(Const("word_sub",_),ltm),rtm) -> + let lth = rule ltm and rth = rule rtm in + let th1 = MATCH_MP CONGBOUND_WORD_SUB (UNIFY_INTCONG_RULE lth rth) in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | Comb(Comb(Const("word_mul",_),ltm),rtm) -> + let lth = rule ltm and rth = rule rtm in + let th1 = MATCH_MP CONGBOUND_WORD_MUL (UNIFY_INTCONG_RULE lth rth) 
in + CONCL_BOUNDS_RULE(SIDE_ELIM_RULE th1) + | _ -> CONCL_BOUNDS_RULE(ISPEC tm CONGBOUND_ATOM) in + rule;; + +let CONGBOUND_RULE = GEN_CONGBOUND_RULE [];; + +(* ------------------------------------------------------------------------- *) +(* Simplify SIMD cruft and fold abbreviations when encountered. *) +(* ------------------------------------------------------------------------- *) + +let SIMD_SIMPLIFY_CONV unfold_defs = + TOP_DEPTH_CONV + (REWR_CONV WORD_SUBWORD_AND ORELSEC WORD_SIMPLE_SUBWORD_CONV) THENC + DEPTH_CONV WORD_NUM_RED_CONV THENC + REWRITE_CONV (map GSYM unfold_defs);; + +let SIMD_SIMPLIFY_TAC unfold_defs = + let arm_simdable = can (term_match [] `read X (s:armstate):int128 = whatever`) in + let x86_simdable = can (term_match [] `read X (s:x86state):int256 = whatever`) in + let simdable tm = arm_simdable tm || x86_simdable tm in + TRY(FIRST_X_ASSUM + (ASSUME_TAC o + CONV_RULE(RAND_CONV (SIMD_SIMPLIFY_CONV unfold_defs)) o + check (simdable o concl)));; diff --git a/proofs/hol_light/x86/Makefile b/proofs/hol_light/x86/Makefile new file mode 100644 index 0000000000..c03b02749e --- /dev/null +++ b/proofs/hol_light/x86/Makefile @@ -0,0 +1,136 @@ +############################################################################# +# Copyright (c) The mlkem-native project authors +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# +# This Makefile is derived from the Makefile x86/Makefile in s2n-bignum. 
+# - Remove all s2n-bignum proofs and tutorial, add mlkem-native proofs +# - Minor path modifications to support base theories from s2n-bignum +# to reside in a separate read-only directory +# + +.DEFAULT_GOAL := run_proofs + +OSTYPE_RESULT=$(shell uname -s) +ARCHTYPE_RESULT=$(shell uname -m) + +SRC ?= $(S2N_BIGNUM_DIR) +SRC_X86 ?= $(SRC)/x86 + +# Add explicit language input parameter to cpp, otherwise the use of #n for +# numeric literals in x86 code is a problem when used inside #define macros +# since normally that means stringization. +# +# Some clang-based preprocessors seem to behave differently, and get confused +# by single-quote characters in comments, so we eliminate // comments first. + +ifeq ($(OSTYPE_RESULT),Darwin) +PREPROCESS=sed -e 's/\/\/.*//' | $(CC) -E -xassembler-with-cpp - +else +PREPROCESS=$(CC) -E -xassembler-with-cpp - +endif + +# Generally GNU-type assemblers are happy with multiple instructions on +# a line, but we split them up anyway just in case. + +SPLIT=tr ';' '\n' + +# If actually on an x86_64 machine, just use the assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). For the clang +# version on OS X we just add the "-arch x86_64" option. For the Linux/gcc +# toolchain we assume the presence of the special cross-assembler. 
This +# can be installed via something like: +# +# sudo apt-get install binutils-x86-64-linux-gnu + +ifeq ($(ARCHTYPE_RESULT),x86_64) +ASSEMBLE=as +OBJDUMP=objdump -d +else +ifeq ($(OSTYPE_RESULT),Darwin) +ASSEMBLE=as -arch x86_64 +OBJDUMP=otool -tvV +else +ASSEMBLE=x86_64-linux-gnu-as +OBJDUMP=x86_64-linux-gnu-objdump -d +endif +endif + +OBJ = mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o \ + mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o \ + mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o + +# Build object files from assembly sources +$(OBJ): %.o : %.S + @echo "Preparing $@ ..." + @echo "AS: `$(ASSEMBLE) --version`" + @echo "OBJDUMP: `$(OBJDUMP) --version`" + $(Q)[ -d $(@D) ] || mkdir -p $(@D) + cat $< | $(PREPROCESS) | $(SPLIT) | $(ASSEMBLE) -o $@ - + # MacOS may generate relocations in non-text sections that break + # the object file parser in HOL-Light + strip $@ + +clean:; rm -f */*.o */*/*.o */*.correct */*.native + +# Proof-related parts +# +# The proof files are all independent, though each one loads the +# same common infrastructure "base.ml". So you can potentially +# run the proofs in parallel for more speed, e.g. 
+# +# nohup make -j 16 proofs & +# +# If you build hol-light yourself (see https://github.com/jrh13/hol-light) +# in your home directory, and do "make" inside the subdirectory hol-light, +# then the following HOLDIR setting should be right: + +HOLDIR?=$(HOLLIGHTDIR) +HOLLIGHT:=$(HOLDIR)/hol.sh + +BASE?=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +PROOF_BINS = $(OBJ:.o=.native) +PROOF_LOGS = $(OBJ:.o=.correct) + +# Build precompiled binary for dumping bytecodes +proofs/dump_bytecode.native: proofs/dump_bytecode.ml $(OBJ) + ./proofs/build-proof.sh $(BASE)/$< "$(HOLLIGHT)" "$@" + +# Build precompiled native binaries of HOL Light proofs + +.SECONDEXPANSION: +%.native: proofs/$$(*F).ml %.o ; ./proofs/build-proof.sh $(BASE)/$< "$(HOLLIGHT)" "$@" + +# Run them from the parent directory; tee standard output+error to *.correct +%.correct: %.native + cd .. && ./x86/$< 2>&1 | tee ./x86/$@ + @if (grep -i "error:\|exception:" "$@" >/dev/null); then \ + echo "$< had errors!"; \ + exit 1; \ + else \ + echo "$< OK"; \ + fi + +build_proofs: $(PROOF_BINS); +run_proofs: build_proofs $(PROOF_LOGS); + +proofs: run_proofs ; $(SRC)/tools/count-proofs.sh . + +dump_bytecode: proofs/dump_bytecode.native + ./$< + +.PHONY: proofs build_proofs run_proofs sematest clean dump_bytecode + +# Always run sematest regardless of dependency check +FORCE: ; +# Always use max. # of cores because in Makefile one cannot get the passed number of -j. +# A portable way of getting the number of max.
cores: +# https://stackoverflow.com/a/23569003/1488216 +NUM_CORES_FOR_SEMATEST = $(shell getconf _NPROCESSORS_ONLN) +sematest: FORCE $(OBJ) $(SRC_X86)/proofs/simulator_iclasses.ml $(SRC_X86)/proofs/simulator.native + $(SRC)/tools/run-sematest.sh x86 $(NUM_CORES_FOR_SEMATEST) diff --git a/proofs/hol_light/x86/list_proofs.sh b/proofs/hol_light/x86/list_proofs.sh new file mode 100755 index 0000000000..cfe76d4d9a --- /dev/null +++ b/proofs/hol_light/x86/list_proofs.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT +# +# This tiny script just lists the names of source files for which +# we have a spec and proof in HOL-Light. + +ROOT=$(git rev-parse --show-toplevel) +cd "$ROOT" || exit 1 +ls -1 proofs/hol_light/x86/mlkem/*.S | cut -d '/' -f 5 | sed 's/\.S//' diff --git a/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S new file mode 100644 index 0000000000..b60106ff98 --- /dev/null +++ b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.S @@ -0,0 +1,496 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
+ */
+/* Fully unrolled base multiplication with accumulation: 16 blocks of 16-bit-lane AVX2 arithmetic; blocks 0-7 store 0x200 bytes at (%rdi), blocks 8-15 add their results onto them. */
+/* NOTE(review): presumably rdi = result, rsi/rdx = the two input vectors, rcx = multiplication cache (inferred from the symbol name) - confirm against the C prototype. */
+.text
+.balign 4
+#ifdef __APPLE__
+.global _PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2
+_PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2:
+#else
+.global PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2
+PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2:
+#endif
+/* Per block: out0 = mont(A0*B0) +/- mont(A1*C), out1 = mont(A0*B1) + mont(A1*B0), with mont(x*y) = hi16(x*y) - hi16(lo16(x*y*qinv) * q); the sign is + in even blocks and - in odd blocks. */
+        .cfi_startproc
+        endbr64
+        movl $0xd010d01, %eax # imm = 0xD010D01
+        vmovd %eax, %xmm0
+        vpbroadcastd %xmm0, %ymm0  /* ymm0: every 16-bit lane = 0x0d01 = 3329 (q) */
+        movl $0xf301f301, %eax # imm = 0xF301F301
+        vmovd %eax, %xmm1
+        vpbroadcastd %xmm1, %ymm1  /* ymm1: every 16-bit lane = 0xf301 = 62209 (qinv); 3329 * 62209 = 1 mod 2^16 */
+        vmovdqa (%rsi), %ymm2  /* block 0 (even): A0 */
+        vmovdqa 0x20(%rsi), %ymm3  /* A1 */
+        vmovdqa (%rdx), %ymm4  /* B0 */
+        vmovdqa 0x20(%rdx), %ymm5  /* B1 */
+        vmovdqa (%rcx), %ymm6  /* C */
+        vpmullw %ymm2, %ymm1, %ymm13  /* lo16(A0*qinv) */
+        vpmullw %ymm3, %ymm1, %ymm14  /* lo16(A1*qinv) */
+        vpmullw %ymm13, %ymm4, %ymm7  /* lo16(A0*B0*qinv) */
+        vpmullw %ymm13, %ymm5, %ymm9  /* lo16(A0*B1*qinv) */
+        vpmullw %ymm14, %ymm6, %ymm8  /* lo16(A1*C*qinv) */
+        vpmullw %ymm14, %ymm4, %ymm10  /* lo16(A1*B0*qinv) */
+        vpmulhw %ymm7, %ymm0, %ymm7  /* signed hi16(previous value * q), same for the next three */
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11  /* hi16(A0*B0) */
+        vpmulhw %ymm2, %ymm5, %ymm12  /* hi16(A0*B1) */
+        vpmulhw %ymm3, %ymm6, %ymm13  /* hi16(A1*C) */
+        vpmulhw %ymm3, %ymm4, %ymm14  /* hi16(A1*B0) */
+        vpsubw %ymm7, %ymm11, %ymm7  /* ymm7 = ymm11 - ymm7 = mont(A0*B0) */
+        vpsubw %ymm9, %ymm12, %ymm9  /* mont(A0*B1) */
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: ymm8 = ymm13 - ymm8 = +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10  /* mont(A1*B0) */
+        vpaddw %ymm7, %ymm8, %ymm7  /* out0 */
+        vpaddw %ymm9, %ymm10, %ymm9  /* out1 */
+        vmovdqa %ymm7, (%rdi)
+        vmovdqa %ymm9, 0x20(%rdi)
+        vmovdqa 0x40(%rsi), %ymm2  /* block 1 (odd) */
+        vmovdqa 0x60(%rsi), %ymm3
+        vmovdqa 0x40(%rdx), %ymm4
+        vmovdqa 0x60(%rdx), %ymm5
+        vmovdqa 0x20(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: ymm8 = ymm8 - ymm13 = -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x40(%rdi)
+        vmovdqa %ymm9, 0x60(%rdi)
+        vmovdqa 0x80(%rsi), %ymm2  /* block 2 (even) */
+        vmovdqa 0xa0(%rsi), %ymm3
+        vmovdqa 0x80(%rdx), %ymm4
+        vmovdqa 0xa0(%rdx), %ymm5
+        vmovdqa 0x40(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x80(%rdi)
+        vmovdqa %ymm9, 0xa0(%rdi)
+        vmovdqa 0xc0(%rsi), %ymm2  /* block 3 (odd) */
+        vmovdqa 0xe0(%rsi), %ymm3
+        vmovdqa 0xc0(%rdx), %ymm4
+        vmovdqa 0xe0(%rdx), %ymm5
+        vmovdqa 0x60(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0xc0(%rdi)
+        vmovdqa %ymm9, 0xe0(%rdi)
+        vmovdqa 0x100(%rsi), %ymm2  /* block 4 (even) */
+        vmovdqa 0x120(%rsi), %ymm3
+        vmovdqa 0x100(%rdx), %ymm4
+        vmovdqa 0x120(%rdx), %ymm5
+        vmovdqa 0x80(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x100(%rdi)
+        vmovdqa %ymm9, 0x120(%rdi)
+        vmovdqa 0x140(%rsi), %ymm2  /* block 5 (odd) */
+        vmovdqa 0x160(%rsi), %ymm3
+        vmovdqa 0x140(%rdx), %ymm4
+        vmovdqa 0x160(%rdx), %ymm5
+        vmovdqa 0xa0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x140(%rdi)
+        vmovdqa %ymm9, 0x160(%rdi)
+        vmovdqa 0x180(%rsi), %ymm2  /* block 6 (even) */
+        vmovdqa 0x1a0(%rsi), %ymm3
+        vmovdqa 0x180(%rdx), %ymm4
+        vmovdqa 0x1a0(%rdx), %ymm5
+        vmovdqa 0xc0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x180(%rdi)
+        vmovdqa %ymm9, 0x1a0(%rdi)
+        vmovdqa 0x1c0(%rsi), %ymm2  /* block 7 (odd) */
+        vmovdqa 0x1e0(%rsi), %ymm3
+        vmovdqa 0x1c0(%rdx), %ymm4
+        vmovdqa 0x1e0(%rdx), %ymm5
+        vmovdqa 0xe0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x1c0(%rdi)
+        vmovdqa %ymm9, 0x1e0(%rdi)
+        vmovdqa 0x200(%rsi), %ymm2  /* block 8 (even): inputs from +0x200, cache from +0x100; result is added onto the output of block 0 */
+        vmovdqa 0x220(%rsi), %ymm3
+        vmovdqa 0x200(%rdx), %ymm4
+        vmovdqa 0x220(%rdx), %ymm5
+        vmovdqa 0x100(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa (%rdi), %ymm8  /* reload the values stored by block 0 and accumulate */
+        vmovdqa 0x20(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, (%rdi)
+        vmovdqa %ymm9, 0x20(%rdi)
+        vmovdqa 0x240(%rsi), %ymm2  /* block 9 (odd) */
+        vmovdqa 0x260(%rsi), %ymm3
+        vmovdqa 0x240(%rdx), %ymm4
+        vmovdqa 0x260(%rdx), %ymm5
+        vmovdqa 0x120(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x40(%rdi), %ymm8  /* accumulate onto the output of block 1 */
+        vmovdqa 0x60(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x40(%rdi)
+        vmovdqa %ymm9, 0x60(%rdi)
+        vmovdqa 0x280(%rsi), %ymm2  /* block 10 (even) */
+        vmovdqa 0x2a0(%rsi), %ymm3
+        vmovdqa 0x280(%rdx), %ymm4
+        vmovdqa 0x2a0(%rdx), %ymm5
+        vmovdqa 0x140(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x80(%rdi), %ymm8  /* accumulate onto the output of block 2 */
+        vmovdqa 0xa0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x80(%rdi)
+        vmovdqa %ymm9, 0xa0(%rdi)
+        vmovdqa 0x2c0(%rsi), %ymm2  /* block 11 (odd) */
+        vmovdqa 0x2e0(%rsi), %ymm3
+        vmovdqa 0x2c0(%rdx), %ymm4
+        vmovdqa 0x2e0(%rdx), %ymm5
+        vmovdqa 0x160(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0xc0(%rdi), %ymm8  /* accumulate onto the output of block 3 */
+        vmovdqa 0xe0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0xc0(%rdi)
+        vmovdqa %ymm9, 0xe0(%rdi)
+        vmovdqa 0x300(%rsi), %ymm2  /* block 12 (even) */
+        vmovdqa 0x320(%rsi), %ymm3
+        vmovdqa 0x300(%rdx), %ymm4
+        vmovdqa 0x320(%rdx), %ymm5
+        vmovdqa 0x180(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x100(%rdi), %ymm8  /* accumulate onto the output of block 4 */
+        vmovdqa 0x120(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x100(%rdi)
+        vmovdqa %ymm9, 0x120(%rdi)
+        vmovdqa 0x340(%rsi), %ymm2  /* block 13 (odd) */
+        vmovdqa 0x360(%rsi), %ymm3
+        vmovdqa 0x340(%rdx), %ymm4
+        vmovdqa 0x360(%rdx), %ymm5
+        vmovdqa 0x1a0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x140(%rdi), %ymm8  /* accumulate onto the output of block 5 */
+        vmovdqa 0x160(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x140(%rdi)
+        vmovdqa %ymm9, 0x160(%rdi)
+        vmovdqa 0x380(%rsi), %ymm2  /* block 14 (even) */
+        vmovdqa 0x3a0(%rsi), %ymm3
+        vmovdqa 0x380(%rdx), %ymm4
+        vmovdqa 0x3a0(%rdx), %ymm5
+        vmovdqa 0x1c0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm8, %ymm13, %ymm8  /* even block: +mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x180(%rdi), %ymm8  /* accumulate onto the output of block 6 */
+        vmovdqa 0x1a0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x180(%rdi)
+        vmovdqa %ymm9, 0x1a0(%rdi)
+        vmovdqa 0x3c0(%rsi), %ymm2  /* block 15 (odd) */
+        vmovdqa 0x3e0(%rsi), %ymm3
+        vmovdqa 0x3c0(%rdx), %ymm4
+        vmovdqa 0x3e0(%rdx), %ymm5
+        vmovdqa 0x1e0(%rcx), %ymm6
+        vpmullw %ymm2, %ymm1, %ymm13
+        vpmullw %ymm3, %ymm1, %ymm14
+        vpmullw %ymm13, %ymm4, %ymm7
+        vpmullw %ymm13, %ymm5, %ymm9
+        vpmullw %ymm14, %ymm6, %ymm8
+        vpmullw %ymm14, %ymm4, %ymm10
+        vpmulhw %ymm7, %ymm0, %ymm7
+        vpmulhw %ymm9, %ymm0, %ymm9
+        vpmulhw %ymm8, %ymm0, %ymm8
+        vpmulhw %ymm10, %ymm0, %ymm10
+        vpmulhw %ymm2, %ymm4, %ymm11
+        vpmulhw %ymm2, %ymm5, %ymm12
+        vpmulhw %ymm3, %ymm6, %ymm13
+        vpmulhw %ymm3, %ymm4, %ymm14
+        vpsubw %ymm7, %ymm11, %ymm7
+        vpsubw %ymm9, %ymm12, %ymm9
+        vpsubw %ymm13, %ymm8, %ymm8  /* odd block: -mont(A1*C) */
+        vpsubw %ymm10, %ymm14, %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa 0x1c0(%rdi), %ymm8  /* accumulate onto the output of block 7 */
+        vmovdqa 0x1e0(%rdi), %ymm10
+        vpaddw %ymm7, %ymm8, %ymm7
+        vpaddw %ymm9, %ymm10, %ymm9
+        vmovdqa %ymm7, 0x1c0(%rdi)
+        vmovdqa %ymm9, 0x1e0(%rdi)
+        retq
+        .cfi_endproc
diff --git a/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S
new file mode 100644
index 0000000000..c3177ec341
--- /dev/null
+++ b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.S
@@ -0,0 +1,744 @@
+/*
+ * Copyright (c) The mlkem-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+
+/*
+ * WARNING: This file is auto-derived from the mlkem-native source file
+ * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
+ */ + + +.text +.balign 4 +#ifdef __APPLE__ +.global _PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k3 +_PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k3: +#else +.global PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k3 +PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k3: +#endif + + .cfi_startproc + endbr64 + movl $0xd010d01, %eax # imm = 0xD010D01 + vmovd %eax, %xmm0 + vpbroadcastd %xmm0, %ymm0 + movl $0xf301f301, %eax # imm = 0xF301F301 + vmovd %eax, %xmm1 + vpbroadcastd %xmm1, %ymm1 + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm3 + vmovdqa (%rdx), %ymm4 + vmovdqa 0x20(%rdx), %ymm5 + vmovdqa (%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x40(%rsi), %ymm2 + vmovdqa 0x60(%rsi), %ymm3 + vmovdqa 0x40(%rdx), %ymm4 + vmovdqa 0x60(%rdx), %ymm5 + vmovdqa 0x20(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, 
%ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x80(%rsi), %ymm2 + vmovdqa 0xa0(%rsi), %ymm3 + vmovdqa 0x80(%rdx), %ymm4 + vmovdqa 0xa0(%rdx), %ymm5 + vmovdqa 0x40(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0xc0(%rsi), %ymm2 + vmovdqa 0xe0(%rsi), %ymm3 + vmovdqa 0xc0(%rdx), %ymm4 + vmovdqa 0xe0(%rdx), %ymm5 + vmovdqa 0x60(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x100(%rsi), %ymm2 + vmovdqa 0x120(%rsi), %ymm3 + vmovdqa 0x100(%rdx), %ymm4 + vmovdqa 0x120(%rdx), %ymm5 + vmovdqa 0x80(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw 
%ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x140(%rsi), %ymm2 + vmovdqa 0x160(%rsi), %ymm3 + vmovdqa 0x140(%rdx), %ymm4 + vmovdqa 0x160(%rdx), %ymm5 + vmovdqa 0xa0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x180(%rsi), %ymm2 + vmovdqa 0x1a0(%rsi), %ymm3 + vmovdqa 0x180(%rdx), %ymm4 + vmovdqa 0x1a0(%rdx), %ymm5 + vmovdqa 0xc0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + 
vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x1c0(%rsi), %ymm2 + vmovdqa 0x1e0(%rsi), %ymm3 + vmovdqa 0x1c0(%rdx), %ymm4 + vmovdqa 0x1e0(%rdx), %ymm5 + vmovdqa 0xe0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + vmovdqa 0x200(%rsi), %ymm2 + vmovdqa 0x220(%rsi), %ymm3 + vmovdqa 0x200(%rdx), %ymm4 + vmovdqa 0x220(%rdx), %ymm5 + vmovdqa 0x100(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa (%rdi), %ymm8 + vmovdqa 0x20(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 
0x20(%rdi) + vmovdqa 0x240(%rsi), %ymm2 + vmovdqa 0x260(%rsi), %ymm3 + vmovdqa 0x240(%rdx), %ymm4 + vmovdqa 0x260(%rdx), %ymm5 + vmovdqa 0x120(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x40(%rdi), %ymm8 + vmovdqa 0x60(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x280(%rsi), %ymm2 + vmovdqa 0x2a0(%rsi), %ymm3 + vmovdqa 0x280(%rdx), %ymm4 + vmovdqa 0x2a0(%rdx), %ymm5 + vmovdqa 0x140(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0x2c0(%rsi), %ymm2 + vmovdqa 0x2e0(%rsi), %ymm3 + vmovdqa 0x2c0(%rdx), %ymm4 + vmovdqa 0x2e0(%rdx), %ymm5 + vmovdqa 
0x160(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0xc0(%rdi), %ymm8 + vmovdqa 0xe0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x300(%rsi), %ymm2 + vmovdqa 0x320(%rsi), %ymm3 + vmovdqa 0x300(%rdx), %ymm4 + vmovdqa 0x320(%rdx), %ymm5 + vmovdqa 0x180(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x100(%rdi), %ymm8 + vmovdqa 0x120(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x340(%rsi), %ymm2 + vmovdqa 0x360(%rsi), %ymm3 + vmovdqa 0x340(%rdx), %ymm4 + vmovdqa 0x360(%rdx), %ymm5 + vmovdqa 0x1a0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, 
%ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x140(%rdi), %ymm8 + vmovdqa 0x160(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x380(%rsi), %ymm2 + vmovdqa 0x3a0(%rsi), %ymm3 + vmovdqa 0x380(%rdx), %ymm4 + vmovdqa 0x3a0(%rdx), %ymm5 + vmovdqa 0x1c0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x3c0(%rsi), %ymm2 + vmovdqa 0x3e0(%rsi), %ymm3 + vmovdqa 0x3c0(%rdx), %ymm4 + vmovdqa 0x3e0(%rdx), %ymm5 + vmovdqa 0x1e0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, 
%ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x1c0(%rdi), %ymm8 + vmovdqa 0x1e0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + vmovdqa 0x400(%rsi), %ymm2 + vmovdqa 0x420(%rsi), %ymm3 + vmovdqa 0x400(%rdx), %ymm4 + vmovdqa 0x420(%rdx), %ymm5 + vmovdqa 0x200(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa (%rdi), %ymm8 + vmovdqa 0x20(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x440(%rsi), %ymm2 + vmovdqa 0x460(%rsi), %ymm3 + vmovdqa 0x440(%rdx), %ymm4 + vmovdqa 0x460(%rdx), %ymm5 + vmovdqa 0x220(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw 
%ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x40(%rdi), %ymm8 + vmovdqa 0x60(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x480(%rsi), %ymm2 + vmovdqa 0x4a0(%rsi), %ymm3 + vmovdqa 0x480(%rdx), %ymm4 + vmovdqa 0x4a0(%rdx), %ymm5 + vmovdqa 0x240(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0x4c0(%rsi), %ymm2 + vmovdqa 0x4e0(%rsi), %ymm3 + vmovdqa 0x4c0(%rdx), %ymm4 + vmovdqa 0x4e0(%rdx), %ymm5 + vmovdqa 0x260(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, 
%ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0xc0(%rdi), %ymm8 + vmovdqa 0xe0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x500(%rsi), %ymm2 + vmovdqa 0x520(%rsi), %ymm3 + vmovdqa 0x500(%rdx), %ymm4 + vmovdqa 0x520(%rdx), %ymm5 + vmovdqa 0x280(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x100(%rdi), %ymm8 + vmovdqa 0x120(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x540(%rsi), %ymm2 + vmovdqa 0x560(%rsi), %ymm3 + vmovdqa 0x540(%rdx), %ymm4 + vmovdqa 0x560(%rdx), %ymm5 + vmovdqa 0x2a0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x140(%rdi), %ymm8 + 
vmovdqa 0x160(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x580(%rsi), %ymm2 + vmovdqa 0x5a0(%rsi), %ymm3 + vmovdqa 0x580(%rdx), %ymm4 + vmovdqa 0x5a0(%rdx), %ymm5 + vmovdqa 0x2c0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x5c0(%rsi), %ymm2 + vmovdqa 0x5e0(%rsi), %ymm3 + vmovdqa 0x5c0(%rdx), %ymm4 + vmovdqa 0x5e0(%rdx), %ymm5 + vmovdqa 0x2e0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x1c0(%rdi), %ymm8 + vmovdqa 0x1e0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa 
%ymm9, 0x1e0(%rdi) + retq + .cfi_endproc diff --git a/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S new file mode 100644 index 0000000000..3c54979af3 --- /dev/null +++ b/proofs/hol_light/x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.S @@ -0,0 +1,992 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 4 +#ifdef __APPLE__ +.global _PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k4 +_PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k4: +#else +.global PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k4 +PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k4: +#endif + + .cfi_startproc + endbr64 + movl $0xd010d01, %eax # imm = 0xD010D01 + vmovd %eax, %xmm0 + vpbroadcastd %xmm0, %ymm0 + movl $0xf301f301, %eax # imm = 0xF301F301 + vmovd %eax, %xmm1 + vpbroadcastd %xmm1, %ymm1 + vmovdqa (%rsi), %ymm2 + vmovdqa 0x20(%rsi), %ymm3 + vmovdqa (%rdx), %ymm4 + vmovdqa 0x20(%rdx), %ymm5 + vmovdqa (%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 
+ vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x40(%rsi), %ymm2 + vmovdqa 0x60(%rsi), %ymm3 + vmovdqa 0x40(%rdx), %ymm4 + vmovdqa 0x60(%rdx), %ymm5 + vmovdqa 0x20(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x80(%rsi), %ymm2 + vmovdqa 0xa0(%rsi), %ymm3 + vmovdqa 0x80(%rdx), %ymm4 + vmovdqa 0xa0(%rdx), %ymm5 + vmovdqa 0x40(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0xc0(%rsi), %ymm2 + vmovdqa 0xe0(%rsi), %ymm3 + vmovdqa 0xc0(%rdx), %ymm4 + vmovdqa 0xe0(%rdx), %ymm5 + vmovdqa 0x60(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 
+ vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x100(%rsi), %ymm2 + vmovdqa 0x120(%rsi), %ymm3 + vmovdqa 0x100(%rdx), %ymm4 + vmovdqa 0x120(%rdx), %ymm5 + vmovdqa 0x80(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x140(%rsi), %ymm2 + vmovdqa 0x160(%rsi), %ymm3 + vmovdqa 0x140(%rdx), %ymm4 + vmovdqa 0x160(%rdx), %ymm5 + vmovdqa 0xa0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + 
vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x180(%rsi), %ymm2 + vmovdqa 0x1a0(%rsi), %ymm3 + vmovdqa 0x180(%rdx), %ymm4 + vmovdqa 0x1a0(%rdx), %ymm5 + vmovdqa 0xc0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x1c0(%rsi), %ymm2 + vmovdqa 0x1e0(%rsi), %ymm3 + vmovdqa 0x1c0(%rdx), %ymm4 + vmovdqa 0x1e0(%rdx), %ymm5 + vmovdqa 0xe0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + vmovdqa 0x200(%rsi), %ymm2 + vmovdqa 0x220(%rsi), %ymm3 + vmovdqa 0x200(%rdx), %ymm4 + vmovdqa 0x220(%rdx), %ymm5 + vmovdqa 0x100(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw 
%ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa (%rdi), %ymm8 + vmovdqa 0x20(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x240(%rsi), %ymm2 + vmovdqa 0x260(%rsi), %ymm3 + vmovdqa 0x240(%rdx), %ymm4 + vmovdqa 0x260(%rdx), %ymm5 + vmovdqa 0x120(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x40(%rdi), %ymm8 + vmovdqa 0x60(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x280(%rsi), %ymm2 + vmovdqa 0x2a0(%rsi), %ymm3 + vmovdqa 0x280(%rdx), %ymm4 + vmovdqa 0x2a0(%rdx), %ymm5 + vmovdqa 0x140(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + 
vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0x2c0(%rsi), %ymm2 + vmovdqa 0x2e0(%rsi), %ymm3 + vmovdqa 0x2c0(%rdx), %ymm4 + vmovdqa 0x2e0(%rdx), %ymm5 + vmovdqa 0x160(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0xc0(%rdi), %ymm8 + vmovdqa 0xe0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x300(%rsi), %ymm2 + vmovdqa 0x320(%rsi), %ymm3 + vmovdqa 0x300(%rdx), %ymm4 + vmovdqa 0x320(%rdx), %ymm5 + vmovdqa 0x180(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw 
%ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x100(%rdi), %ymm8 + vmovdqa 0x120(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x340(%rsi), %ymm2 + vmovdqa 0x360(%rsi), %ymm3 + vmovdqa 0x340(%rdx), %ymm4 + vmovdqa 0x360(%rdx), %ymm5 + vmovdqa 0x1a0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x140(%rdi), %ymm8 + vmovdqa 0x160(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x380(%rsi), %ymm2 + vmovdqa 0x3a0(%rsi), %ymm3 + vmovdqa 0x380(%rdx), %ymm4 + vmovdqa 0x3a0(%rdx), %ymm5 + vmovdqa 0x1c0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, 
%ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x3c0(%rsi), %ymm2 + vmovdqa 0x3e0(%rsi), %ymm3 + vmovdqa 0x3c0(%rdx), %ymm4 + vmovdqa 0x3e0(%rdx), %ymm5 + vmovdqa 0x1e0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x1c0(%rdi), %ymm8 + vmovdqa 0x1e0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + vmovdqa 0x400(%rsi), %ymm2 + vmovdqa 0x420(%rsi), %ymm3 + vmovdqa 0x400(%rdx), %ymm4 + vmovdqa 0x420(%rdx), %ymm5 + vmovdqa 0x200(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa (%rdi), %ymm8 
+ vmovdqa 0x20(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x440(%rsi), %ymm2 + vmovdqa 0x460(%rsi), %ymm3 + vmovdqa 0x440(%rdx), %ymm4 + vmovdqa 0x460(%rdx), %ymm5 + vmovdqa 0x220(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x40(%rdi), %ymm8 + vmovdqa 0x60(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x480(%rsi), %ymm2 + vmovdqa 0x4a0(%rsi), %ymm3 + vmovdqa 0x480(%rdx), %ymm4 + vmovdqa 0x4a0(%rdx), %ymm5 + vmovdqa 0x240(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 
0xa0(%rdi) + vmovdqa 0x4c0(%rsi), %ymm2 + vmovdqa 0x4e0(%rsi), %ymm3 + vmovdqa 0x4c0(%rdx), %ymm4 + vmovdqa 0x4e0(%rdx), %ymm5 + vmovdqa 0x260(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0xc0(%rdi), %ymm8 + vmovdqa 0xe0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x500(%rsi), %ymm2 + vmovdqa 0x520(%rsi), %ymm3 + vmovdqa 0x500(%rdx), %ymm4 + vmovdqa 0x520(%rdx), %ymm5 + vmovdqa 0x280(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x100(%rdi), %ymm8 + vmovdqa 0x120(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x540(%rsi), %ymm2 + vmovdqa 0x560(%rsi), %ymm3 + vmovdqa 0x540(%rdx), %ymm4 + vmovdqa 0x560(%rdx), %ymm5 + 
vmovdqa 0x2a0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x140(%rdi), %ymm8 + vmovdqa 0x160(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x580(%rsi), %ymm2 + vmovdqa 0x5a0(%rsi), %ymm3 + vmovdqa 0x580(%rdx), %ymm4 + vmovdqa 0x5a0(%rdx), %ymm5 + vmovdqa 0x2c0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x5c0(%rsi), %ymm2 + vmovdqa 0x5e0(%rsi), %ymm3 + vmovdqa 0x5c0(%rdx), %ymm4 + vmovdqa 0x5e0(%rdx), %ymm5 + vmovdqa 0x2e0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw 
%ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x1c0(%rdi), %ymm8 + vmovdqa 0x1e0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + vmovdqa 0x600(%rsi), %ymm2 + vmovdqa 0x620(%rsi), %ymm3 + vmovdqa 0x600(%rdx), %ymm4 + vmovdqa 0x620(%rdx), %ymm5 + vmovdqa 0x300(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa (%rdi), %ymm8 + vmovdqa 0x20(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, (%rdi) + vmovdqa %ymm9, 0x20(%rdi) + vmovdqa 0x640(%rsi), %ymm2 + vmovdqa 0x660(%rsi), %ymm3 + vmovdqa 0x640(%rdx), %ymm4 + vmovdqa 0x660(%rdx), %ymm5 + vmovdqa 0x320(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, 
%ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x40(%rdi), %ymm8 + vmovdqa 0x60(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm9, 0x60(%rdi) + vmovdqa 0x680(%rsi), %ymm2 + vmovdqa 0x6a0(%rsi), %ymm3 + vmovdqa 0x680(%rdx), %ymm4 + vmovdqa 0x6a0(%rdx), %ymm5 + vmovdqa 0x340(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x80(%rdi) + vmovdqa %ymm9, 0xa0(%rdi) + vmovdqa 0x6c0(%rsi), %ymm2 + vmovdqa 0x6e0(%rsi), %ymm3 + vmovdqa 0x6c0(%rdx), %ymm4 + vmovdqa 0x6e0(%rdx), %ymm5 + vmovdqa 0x360(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + 
vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0xc0(%rdi), %ymm8 + vmovdqa 0xe0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0xc0(%rdi) + vmovdqa %ymm9, 0xe0(%rdi) + vmovdqa 0x700(%rsi), %ymm2 + vmovdqa 0x720(%rsi), %ymm3 + vmovdqa 0x700(%rdx), %ymm4 + vmovdqa 0x720(%rdx), %ymm5 + vmovdqa 0x380(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x100(%rdi), %ymm8 + vmovdqa 0x120(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x100(%rdi) + vmovdqa %ymm9, 0x120(%rdi) + vmovdqa 0x740(%rsi), %ymm2 + vmovdqa 0x760(%rsi), %ymm3 + vmovdqa 0x740(%rdx), %ymm4 + vmovdqa 0x760(%rdx), %ymm5 + vmovdqa 0x3a0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw 
%ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x140(%rdi), %ymm8 + vmovdqa 0x160(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm9, 0x160(%rdi) + vmovdqa 0x780(%rsi), %ymm2 + vmovdqa 0x7a0(%rsi), %ymm3 + vmovdqa 0x780(%rdx), %ymm4 + vmovdqa 0x7a0(%rdx), %ymm5 + vmovdqa 0x3c0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm8, %ymm13, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm9, 0x1a0(%rdi) + vmovdqa 0x7c0(%rsi), %ymm2 + vmovdqa 0x7e0(%rsi), %ymm3 + vmovdqa 0x7c0(%rdx), %ymm4 + vmovdqa 0x7e0(%rdx), %ymm5 + vmovdqa 0x3e0(%rcx), %ymm6 + vpmullw %ymm2, %ymm1, %ymm13 + vpmullw %ymm3, %ymm1, %ymm14 + vpmullw %ymm13, %ymm4, %ymm7 + vpmullw %ymm13, %ymm5, %ymm9 + vpmullw %ymm14, %ymm6, %ymm8 + vpmullw %ymm14, %ymm4, %ymm10 + vpmulhw %ymm7, %ymm0, %ymm7 + vpmulhw %ymm9, %ymm0, %ymm9 + vpmulhw %ymm8, %ymm0, %ymm8 + vpmulhw %ymm10, %ymm0, %ymm10 + vpmulhw %ymm2, %ymm4, %ymm11 + vpmulhw %ymm2, %ymm5, %ymm12 + vpmulhw %ymm3, %ymm6, %ymm13 + vpmulhw %ymm3, %ymm4, %ymm14 + vpsubw %ymm7, %ymm11, %ymm7 + vpsubw %ymm9, %ymm12, %ymm9 + vpsubw %ymm13, %ymm8, %ymm8 + vpsubw %ymm10, %ymm14, %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa 
0x1c0(%rdi), %ymm8 + vmovdqa 0x1e0(%rdi), %ymm10 + vpaddw %ymm7, %ymm8, %ymm7 + vpaddw %ymm9, %ymm10, %ymm9 + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm9, 0x1e0(%rdi) + retq + .cfi_endproc diff --git a/proofs/hol_light/x86/proofs/build-proof.sh b/proofs/hol_light/x86/proofs/build-proof.sh new file mode 100755 index 0000000000..2025ecdba1 --- /dev/null +++ b/proofs/hol_light/x86/proofs/build-proof.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +# + +# +# Compile a HOL Light proof script into a native executable binary. +# Usage: build-proof.sh <.ml file path> <hol.sh path> <output path> + +# This file is derived from s2n-bignum's tools/build-proof.sh +# Notable changes: +# - Modify HOL-Light's inline_load.ml to allow for a 3rd include path, +# in addition to S2N_BIGNUM_DIR and HOLLIGHT_DIR +# - Removal of s2n-bignum specific code that is not relevant for +# the mlkem-native proofs. + +ROOT="$(realpath "$(dirname "$0")"/../..)" + +if [ "$#" -ne 3 ]; then + echo "Usage: $0 <.ml file path> <hol.sh path> <output path>" + echo "This script builds HOL Light proof using OCaml native compiler and puts the " + echo "output binary at <output path>." + exit 1 +fi + +# Return the exit code if any statement fails +set -e + +ml_path="$1" +hol_sh_cmd="$2" +output_path="$3" +output_dir=$(dirname "$output_path") +[ -d "$output_dir" ] || mkdir -p "$output_dir" + +export HOLLIGHT_DIR="$(dirname "${hol_sh_cmd}")" +if [ ! -f "${HOLLIGHT_DIR}/hol_lib.cmxa" ]; then + echo "hol_lib.cmxa does not exist in HOLLIGHT_DIR('${HOLLIGHT_DIR}')." + echo "Did you compile HOL Light with HOLLIGHT_USE_MODULE set to 1?" + exit 1 +fi +# mktemp creates a placeholder file; the ".ml"-suffixed sibling is the file actually written (both are removed at the end). +template_ml="$(mktemp).ml" +echo "Generating a template .ml that loads the file...: ${template_ml}" + +( + echo 'let proof_start_time = Unix.time();;' + echo "loadt \"${ml_path}\";;" + echo "check_axioms ();;" + echo 'let proof_end_time = Unix.time();;' + echo 'Printf.printf "Running time: %f sec, Start unixtime: %f, End unixtime: %f\n" (proof_end_time -.
proof_start_time) proof_start_time proof_end_time;;' +) >>"${template_ml}" + +inlined_prefix="$(mktemp)" +inlined_ml="${inlined_prefix}.ml" +inlined_cmx="${inlined_prefix}.cmx" +(cd "${S2N_BIGNUM_DIR:?S2N_BIGNUM_DIR must be set}" && HOLLIGHT_LOAD_PATH="${ROOT}" ocaml "${HOLLIGHT_DIR}/inline_load.ml" "${template_ml}" "${inlined_ml}") + +# Give a large stack size. +OCAMLRUNPARAM=l=2000000000 \ + ocamlopt.byte -pp "$("${hol_sh_cmd}" -pp)" -I "${HOLLIGHT_DIR}" -I +unix -c \ + hol_lib.cmxa "${inlined_ml}" -o "${inlined_cmx}" -w -a +ocamlfind ocamlopt -package zarith,unix -linkpkg hol_lib.cmxa \ + -I "${HOLLIGHT_DIR}" "${inlined_cmx}" \ + -o "${output_path}" + +# Remove all intermediate files (including the mktemp placeholders and the .o/.cmi compiler byproducts) to save disk space +rm -f "${inlined_cmx}" "${inlined_prefix}.o" "${inlined_prefix}.cmi" "${inlined_ml}" "${inlined_prefix}" "${template_ml}" "${template_ml%.ml}" diff --git a/proofs/hol_light/x86/proofs/dump_bytecode.ml b/proofs/hol_light/x86/proofs/dump_bytecode.ml new file mode 100644 index 0000000000..8da8460c7a --- /dev/null +++ b/proofs/hol_light/x86/proofs/dump_bytecode.ml @@ -0,0 +1,18 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +needs "x86/proofs/base.ml";; +(* Print each object file via print_literal_from_elf between start/end marker lines; each start label repeats the path passed to print_literal_from_elf. *) +print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o ===\n";; +print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; +print_string "==== bytecode end =====================================\n\n";; + +print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o ===\n";; +print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; +print_string "==== bytecode end =====================================\n\n";; + +print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o ===\n";; +print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; +print_string "==== bytecode end =====================================\n\n";; diff --git a/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml new file mode 100644 index 0000000000..f76ad90d5d --- /dev/null +++ b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml @@ -0,0 +1,1170 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Scalar multiplication of 2-element polynomial vectors in NTT domain.
*) +(* ========================================================================= *) + +needs "x86/proofs/base.ml";; + +needs "common/mlkem_specs.ml";; + +let mlkem_basemul_k2_mc = + define_assert_from_elf "mlkem_basemul_k2_mc" "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0xb8; 0x01; 0x0d; 0x01; 0x0d; + (* MOV (% eax) (Imm32 (word 218172673)) *) + 0xc5; 0xf9; 0x6e; 0xc0; (* VMOVD (%_% xmm0) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc0; + (* VPBROADCASTD (%_% ymm0) (%_% xmm0) *) + 0xb8; 0x01; 0xf3; 0x01; 0xf3; + (* MOV (% eax) (Imm32 (word 4076991233)) *) + 0xc5; 0xf9; 0x6e; 0xc8; (* VMOVD (%_% xmm1) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc9; + (* VPBROADCASTD (%_% ymm1) (%_% xmm1) *) + 0xc5; 0xfd; 0x6f; 0x16; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,0))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x20; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,32))) *) + 0xc5; 0xfd; 0x6f; 0x22; (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,0))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x20; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,32))) *) + 0xc5; 0xfd; 0x6f; 0x31; (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,0))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 
0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x56; 0x40; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,64))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x60; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,96))) *) + 0xc5; 0xfd; 0x6f; 0x62; 0x40; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,64))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x60; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,96))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x20; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,32))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 
0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,128))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,160))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,128))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,160))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x40; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,64))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) 
(%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,192))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,224))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,192))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,224))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x60; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,96))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% 
ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,256))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,288))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% 
ymm4) (Memop Word256 (%% (rdx,256))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,288))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,128))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop 
Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,320))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,352))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,320))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,352))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,160))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 
0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,384))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,416))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,384))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,416))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,192))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 
0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,448))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,480))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,448))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,480))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,224))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) 
(%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,512))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,544))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,512))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,544))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,256))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) 
(%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,576))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,608))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,576))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,608))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,288))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% 
ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,640))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,672))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,640))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,672))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,320))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 
0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,704))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,736))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,704))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,736))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,352))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 
0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,768))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,800))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,768))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,800))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,384))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 
0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,832))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,864))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,832))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,864))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,416))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 
0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,896))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,928))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,896))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,928))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,448))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 
0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,960))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,992))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,960))) *) + 
0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,992))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,480))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 
0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +let mlkem_basemul_k2_tmc = define_trimmed "mlkem_basemul_k2_tmc" mlkem_basemul_k2_mc;; +let mlkem_basemul_k2_tmc_EXEC = X86_MK_CORE_EXEC_RULE mlkem_basemul_k2_tmc;; + +(* Enable simplification of word_subwords by default. + Needed to prevent the symbolic simulation from exploding + as we add more instructions. *) +let org_extra_word_conv = !extra_word_CONV;; +extra_word_CONV := [WORD_SIMPLE_SUBWORD_CONV] @ !extra_word_CONV;; + + +(* (a + bX) * (c + dX) = (a*c + b*dz) + (a*d + b*c)X *) +let pmul0 = define + `pmul0 (a: int) (b : int) (c : int) (dz : int) = b*dz + a*c`;; + +let pmul0_odd = define + `pmul0_odd (a: int) (b : int) (c : int) (dz : int) = a*c - b*dz`;; + +let pmul1 = define + `pmul1 (a: int) (b : int) (c : int) (d : int) = b*c + a*d`;; + +let pmulacc0 = define + `pmulacc0 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + pmul0 a0 b0 c0 dz0 + pmul0 a1 b1 c1 dz1`;; + +let pmulacc0_odd = define + `pmulacc0_odd (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + pmul0_odd a0 b0 c0 dz0 + pmul0_odd a1 b1 c1 dz1`;; + +let pmulacc1 = define + `pmulacc1 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + pmul1 a0 b0 c0 d0 + pmul1 a1 b1 c1 d1`;; + +let pmulaccred0 = define + `pmulaccred0 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + (&(inverse_mod 3329 65536) * pmulacc0 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1)
rem &3329`;; + +let pmulaccred0_odd = define + `pmulaccred0_odd (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + (&(inverse_mod 3329 65536) * pmulacc0_odd a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1) rem &3329`;; + + +let pmulaccred1 = define + `pmulaccred1 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) = + (&(inverse_mod 3329 65536) * pmulacc1 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1) rem &3329`;; + +let MLKEM_BASEMUL_K2_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 pc. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1024); (src2, 1024); (src2t, 512)] /\ + nonoverlapping (dst, 512) (word pc, 2502) + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mlkem_basemul_k2_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j)) + (\s. read RIP s = word (pc + 2502) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + ) (mod &3329))) + (MAYCHANGE [events] ,, + MAYCHANGE [RIP] ,, MAYCHANGE [RAX] ,, + MAYCHANGE [ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; + ZMM8; ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14] ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + + REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + NONOVERLAPPING_CLAUSES; ALL; C_ARGUMENTS; fst mlkem_basemul_k2_tmc_EXEC] THEN + REPEAT STRIP_TAC THEN + + GHOST_INTRO_TAC `init_ymm0:int256` `read YMM0` THEN + GHOST_INTRO_TAC `init_ymm1:int256` `read YMM1` THEN + + CONV_TAC(RATOR_CONV(LAND_CONV(TOP_DEPTH_CONV EXPAND_CASES_CONV))) THEN + CONV_TAC(TOP_DEPTH_CONV NUM_MULT_CONV THENC + TOP_DEPTH_CONV NUM_ADD_CONV) THEN + + ENSURES_INIT_TAC "s0" THEN + + MEMORY_256_FROM_16_TAC "src1" 64 THEN + MEMORY_256_FROM_16_TAC "src2" 64 THEN + MEMORY_256_FROM_16_TAC "src2t" 32 THEN + + ASM_REWRITE_TAC [WORD_ADD_0] THEN + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes16 any) s = x`] THEN + REPEAT STRIP_TAC THEN + + MAP_EVERY (fun n -> X86_STEPS_TAC mlkem_basemul_k2_tmc_EXEC [n] THEN + SIMD_SIMPLIFY_TAC [montmul_x86; montmul_odd_x86]) + (1--470) THEN + + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(SIMD_SIMPLIFY_CONV[]) o + CONV_RULE(READ_MEMORY_SPLIT_CONV 4) o + check (can (term_match [] `read qqq s:int256 = xxx`) o concl))) THEN + + CONV_TAC(TOP_DEPTH_CONV EXPAND_CASES_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + DEPTH_CONV let_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + 
DEPTH_CONV let_CONV) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + + DISCARD_STATE_TAC "s470" THEN + + REPEAT CONJ_TAC THEN + REWRITE_TAC[pmulaccred0; pmulacc0; pmul0; pmulaccred0_odd; + pmulacc0_odd; pmul0_odd; pmulaccred1; pmulacc1; pmul1] THEN + STRIP_TAC THEN + ASSUM_LIST((fun ths -> W(MP_TAC o CONJUNCT1 o GEN_CONGBOUND_RULE ths o + rand o lhand o rator o snd))) THEN + REWRITE_TAC[GSYM INT_REM_EQ] THEN CONV_TAC INT_REM_DOWN_CONV THEN + MATCH_MP_TAC EQ_IMP THEN AP_TERM_TAC THEN AP_THM_TAC THEN AP_TERM_TAC THEN + CONV_TAC INT_RING +);; + +let MLKEM_BASEMUL_K2_NOIBT_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1024); (src2, 1024); (src2t, 512)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k2_tmc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mlkem_basemul_k2_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mlkem_basemul_k2_tmc MLKEM_BASEMUL_K2_CORRECT);; + +let MLKEM_BASEMUL_K2_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1024); (src2, 1024); (src2t, 512)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k2_mc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mlkem_basemul_k2_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLKEM_BASEMUL_K2_NOIBT_SUBROUTINE_CORRECT));; + diff --git a/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml new file mode 100644 index 0000000000..d02ffba509 --- /dev/null +++ b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml @@ -0,0 +1,1665 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Scalar multiplication of 3-element polynomial vectors in NTT domain.
*) +(* ========================================================================= *) + +needs "x86/proofs/base.ml";; + +needs "common/mlkem_specs.ml";; + +let mlkem_basemul_k3_mc = + define_assert_from_elf "mlkem_basemul_k3_mc" "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0xb8; 0x01; 0x0d; 0x01; 0x0d; + (* MOV (% eax) (Imm32 (word 218172673)) *) + 0xc5; 0xf9; 0x6e; 0xc0; (* VMOVD (%_% xmm0) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc0; + (* VPBROADCASTD (%_% ymm0) (%_% xmm0) *) + 0xb8; 0x01; 0xf3; 0x01; 0xf3; + (* MOV (% eax) (Imm32 (word 4076991233)) *) + 0xc5; 0xf9; 0x6e; 0xc8; (* VMOVD (%_% xmm1) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc9; + (* VPBROADCASTD (%_% ymm1) (%_% xmm1) *) + 0xc5; 0xfd; 0x6f; 0x16; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,0))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x20; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,32))) *) + 0xc5; 0xfd; 0x6f; 0x22; (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,0))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x20; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,32))) *) + 0xc5; 0xfd; 0x6f; 0x31; (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,0))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 
0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x56; 0x40; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,64))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x60; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,96))) *) + 0xc5; 0xfd; 0x6f; 0x62; 0x40; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,64))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x60; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,96))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x20; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,32))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 
0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,128))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,160))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,128))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,160))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x40; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,64))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) 
(%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,192))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,224))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,192))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,224))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x60; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,96))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% 
ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,256))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,288))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% 
ymm4) (Memop Word256 (%% (rdx,256))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,288))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,128))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop 
Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,320))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,352))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,320))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,352))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,160))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 
0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,384))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,416))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,384))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,416))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,192))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 
0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,448))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,480))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,448))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,480))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,224))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) 
(%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,512))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,544))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,512))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,544))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,256))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) 
(%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,576))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,608))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,576))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,608))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,288))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% 
ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,640))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,672))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,640))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,672))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,320))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 
0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,704))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,736))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,704))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,736))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,352))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 
0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,768))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,800))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,768))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,800))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,384))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 
0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,832))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,864))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,832))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,864))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,416))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 
0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,896))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,928))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,896))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,928))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,448))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 
0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,960))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,992))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,960))) *) + 
0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,992))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,480))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 
0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1024))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1056))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1024))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1056))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,512))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) 
*) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1088))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1120))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1088))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1120))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,544))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 
0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1152))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1184))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1152))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,1184))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,576))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 
0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1216))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1248))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1216))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1248))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,608))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 
0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1280))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1312))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1280))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1312))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,640))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) 
*) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1344))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1376))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1344))) 
*) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1376))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,672))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) 
+ 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1408))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1440))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1408))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1440))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,704))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% 
ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1472))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1504))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1472))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1504))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,736))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% 
ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +let mlkem_basemul_k3_tmc = define_trimmed "mlkem_basemul_k3_tmc" mlkem_basemul_k3_mc;; +let mlkem_basemul_k3_tmc_EXEC = X86_MK_CORE_EXEC_RULE mlkem_basemul_k3_tmc;; + +(* Enable
simplification of word_subwords by default. + Needed to prevent the symbolic simulation from exploding + as we add more instructions. The prior value is kept in org_extra_word_conv. *) +let org_extra_word_conv = !extra_word_CONV;; +extra_word_CONV := [WORD_SIMPLE_SUBWORD_CONV] @ !extra_word_CONV;; + + +(* (a + bX) * (c + dX) = (a*c + b*dz) + (a*d + b*c)X *) +let pmul0 = define + `pmul0 (a: int) (b : int) (c : int) (dz : int) = b*dz + a*c`;; + +let pmul0_odd = define + `pmul0_odd (a: int) (b : int) (c : int) (dz : int) = a*c - b*dz`;; + +let pmul1 = define + `pmul1 (a: int) (b : int) (c : int) (d : int) = b*c + a*d`;; + +let pmulacc0_k3 = define + `pmulacc0_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + pmul0 a0 b0 c0 dz0 + pmul0 a1 b1 c1 dz1 + pmul0 a2 b2 c2 dz2`;; + +let pmulacc0_odd_k3 = define + `pmulacc0_odd_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + pmul0_odd a0 b0 c0 dz0 + pmul0_odd a1 b1 c1 dz1 + pmul0_odd a2 b2 c2 dz2`;; + +let pmulacc1_k3 = define + `pmulacc1_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + pmul1 a0 b0 c0 d0 + pmul1 a1 b1 c1 d1 + pmul1 a2 b2 c2 d2`;; + +let pmulaccred0_k3 = define + `pmulaccred0_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + (&(inverse_mod 3329 65536) * pmulacc0_k3 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2) rem &3329`;; + +let pmulaccred0_odd_k3 = define + `pmulaccred0_odd_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + (&(inverse_mod
3329 65536) * pmulacc0_odd_k3 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2) rem &3329`;; + + +let pmulaccred1_k3 = define + `pmulaccred1_k3 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) = + (&(inverse_mod 3329 65536) * pmulacc1_k3 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2) rem &3329`;; + +let MLKEM_BASEMUL_K3_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 pc. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1536); (src2, 1536); (src2t, 768)] /\ + nonoverlapping (dst, 512) (word pc, 3852) + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mlkem_basemul_k3_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j)) + (\s. read RIP s = word (pc + 3852) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k3 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + ) (mod &3329))) + (MAYCHANGE [events] ,, + MAYCHANGE [RIP] ,, MAYCHANGE [RAX] ,, + MAYCHANGE [ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; + ZMM8; ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14] ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + + REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + NONOVERLAPPING_CLAUSES; ALL; C_ARGUMENTS; fst mlkem_basemul_k3_tmc_EXEC] THEN + REPEAT STRIP_TAC THEN + + GHOST_INTRO_TAC `init_ymm0:int256` `read YMM0` THEN + GHOST_INTRO_TAC `init_ymm1:int256` `read YMM1` THEN + + CONV_TAC(RATOR_CONV(LAND_CONV(TOP_DEPTH_CONV EXPAND_CASES_CONV))) THEN + CONV_TAC(TOP_DEPTH_CONV NUM_MULT_CONV THENC + TOP_DEPTH_CONV NUM_ADD_CONV) THEN + + ENSURES_INIT_TAC "s0" THEN + + MEMORY_256_FROM_16_TAC "src1" 64 THEN + 
MEMORY_256_FROM_16_TAC "src2" 64 THEN + MEMORY_256_FROM_16_TAC "src2t" 32 THEN + + ASM_REWRITE_TAC [WORD_ADD_0] THEN + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes16 any) s = x`] THEN + REPEAT STRIP_TAC THEN + + MAP_EVERY (fun n -> X86_STEPS_TAC mlkem_basemul_k3_tmc_EXEC [n] THEN + SIMD_SIMPLIFY_TAC [montmul_x86; montmul_odd_x86]) + (1--718) THEN + + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(SIMD_SIMPLIFY_CONV[]) o + CONV_RULE(READ_MEMORY_SPLIT_CONV 4) o + check (can (term_match [] `read qqq s:int256 = xxx`) o concl))) THEN + + CONV_TAC(TOP_DEPTH_CONV EXPAND_CASES_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + DEPTH_CONV let_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + DEPTH_CONV let_CONV) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + + DISCARD_STATE_TAC "s718" THEN + + REPEAT CONJ_TAC THEN + REWRITE_TAC[pmulaccred0_k3; pmulacc0_k3; pmul0; pmulaccred0_odd_k3; + pmulacc0_odd_k3; pmul0_odd; pmulaccred1_k3; pmulacc1_k3; pmul1] THEN + STRIP_TAC THEN + ASSUM_LIST((fun ths -> W(MP_TAC o CONJUNCT1 o GEN_CONGBOUND_RULE ths o + rand o lhand o rator o snd))) THEN + REWRITE_TAC[GSYM INT_REM_EQ] THEN CONV_TAC INT_REM_DOWN_CONV THEN + MATCH_MP_TAC EQ_IMP THEN AP_TERM_TAC THEN AP_THM_TAC THEN AP_TERM_TAC THEN + CONV_TAC INT_RING +);; +(* Subroutine (call/return) form of MLKEM_BASEMUL_K3_CORRECT for the code mlkem_basemul_k3_tmc, derived with X86_PROMOTE_RETURN_NOSTACK_TAC. Entry: RIP = pc, RSP = stackpointer with returnaddress stored there, C arguments (dst, src1, src2, src2t), all four pointers 32-byte aligned, dst[0..512) disjoint from the inputs, the code and the return-address slot. Exit: RIP = returnaddress, RSP = stackpointer + 8, and every 16-bit dst coefficient is congruent mod 3329 to pmulaccred0_k3 / pmulaccred0_odd_k3 / pmulaccred1_k3 of the matching inputs whenever the a and b inputs are bounded by 2^12 in absolute value. Only RSP, the ABI-permitted registers and flags, and dst[0..512) may change. NOTE(review): the tmc code is presumably the machine code with the leading ENDBR64 trimmed; confirm against its definition. *) +let MLKEM_BASEMUL_K3_NOIBT_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1536); (src2, 1536); (src2t, 768)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k3_tmc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s.
bytes_loaded s (word pc) mlkem_basemul_k3_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j)) + (\s.
read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j.
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k3 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mlkem_basemul_k3_tmc MLKEM_BASEMUL_K3_CORRECT);; +(* Same subroutine-level statement as MLKEM_BASEMUL_K3_NOIBT_SUBROUTINE_CORRECT, but for the code mlkem_basemul_k3_mc; it is obtained from that theorem by ADD_IBT_RULE. *) +let MLKEM_BASEMUL_K3_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 1536); (src2, 1536); (src2t, 768)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k3_mc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mlkem_basemul_k3_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j.
j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. 
j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k3 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k3 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLKEM_BASEMUL_K3_NOIBT_SUBROUTINE_CORRECT));; + diff --git a/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml new file mode 100644 index 0000000000..081ef265ea --- /dev/null +++ b/proofs/hol_light/x86/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml @@ -0,0 +1,2166 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Scalar multiplication of 4-element polynomial vectors in NTT domain.
*) +(* ========================================================================= *) + +needs "x86/proofs/base.ml";; +needs "common/mlkem_specs.ml";; + +let mlkem_basemul_k4_mc = + define_assert_from_elf "mlkem_basemul_k4_mc" "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0xb8; 0x01; 0x0d; 0x01; 0x0d; + (* MOV (% eax) (Imm32 (word 218172673)) *) + 0xc5; 0xf9; 0x6e; 0xc0; (* VMOVD (%_% xmm0) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc0; + (* VPBROADCASTD (%_% ymm0) (%_% xmm0) *) + 0xb8; 0x01; 0xf3; 0x01; 0xf3; + (* MOV (% eax) (Imm32 (word 4076991233)) *) + 0xc5; 0xf9; 0x6e; 0xc8; (* VMOVD (%_% xmm1) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xc9; + (* VPBROADCASTD (%_% ymm1) (%_% xmm1) *) + 0xc5; 0xfd; 0x6f; 0x16; (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,0))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x20; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,32))) *) + 0xc5; 0xfd; 0x6f; 0x22; (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,0))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x20; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,32))) *) + 0xc5; 0xfd; 0x6f; 0x31; (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,0))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 
0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x56; 0x40; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,64))) *) + 0xc5; 0xfd; 0x6f; 0x5e; 0x60; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,96))) *) + 0xc5; 0xfd; 0x6f; 0x62; 0x40; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,64))) *) + 0xc5; 0xfd; 0x6f; 0x6a; 0x60; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,96))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x20; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,32))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 
0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,128))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,160))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,128))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,160))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x40; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,64))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) 
(%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,192))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,224))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,192))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,224))) *) + 0xc5; 0xfd; 0x6f; 0x71; 0x60; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,96))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% 
ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,256))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,288))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% 
ymm4) (Memop Word256 (%% (rdx,256))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,288))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,128))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop 
Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,320))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,352))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,320))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,352))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,160))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 
0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,384))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,416))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,384))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,416))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,192))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 
0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,448))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,480))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,448))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,480))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,224))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) 
(%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,512))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,544))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,512))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,544))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,256))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) 
(%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,576))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,608))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,576))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,608))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,288))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% 
ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,640))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,672))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,640))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,672))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,320))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 
0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,704))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,736))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,704))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,736))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,352))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 
0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,768))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,800))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,768))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,800))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,384))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 
0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,832))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,864))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,832))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,864))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,416))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 
0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,896))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,928))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,896))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,928))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,448))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 
0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,960))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,992))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,960))) *) + 
0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,992))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,480))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 
0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1024))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1056))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1024))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1056))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,512))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) 
*) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1088))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1120))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1088))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1120))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,544))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 
0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1152))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1184))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1152))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop 
Word256 (%% (rdx,1184))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,576))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 
0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1216))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1248))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1216))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x04; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1248))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,608))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 
0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1280))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1312))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1280))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1312))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,640))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) 
*) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1344))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1376))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1344))) 
*) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1376))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,672))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) 
+ 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1408))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1440))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1408))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1440))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,704))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% 
ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1472))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1504))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1472))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x05; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1504))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,736))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% 
ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1536))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1568))) *) + 0xc5; 0xfd; 
0x6f; 0xa2; 0x00; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1536))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1568))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,768))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x07; (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,0))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x20; + (* VMOVDQA 
(%_% ymm10) (Memop Word256 (%% (rdi,32))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x3f; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1600))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1632))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1600))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1632))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,800))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) 
*) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x47; 0x40; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,64))) *) + 0xc5; 0x7d; 0x6f; 0x57; 0x60; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,96))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0x7f; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x4f; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1664))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1696))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x80; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1664))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1696))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,832))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% 
ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,128))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,160))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1728))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1760))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% 
(rdx,1728))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x06; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1760))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,864))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,192))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% 
(rdi,224))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x00; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1792))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x20; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1824))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x00; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1792))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x20; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1824))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,896))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% 
ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,256))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,288))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x40; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1856))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0x60; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1888))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0x40; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1856))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0x60; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1888))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,928))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) 
(%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,320))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,352))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0x80; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1920))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xa0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,1952))) *) + 0xc5; 
0xfd; 0x6f; 0xa2; 0x80; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1920))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xa0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,1952))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,960))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x15; 0xf9; 0xc0; + (* VPSUBW (%_% ymm8) (%_% ymm13) (%_% ymm8) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,384))) *) + 0xc5; 
0x7d; 0x6f; 0x97; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,416))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm9) *) + 0xc5; 0xfd; 0x6f; 0x96; 0xc0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm2) (Memop Word256 (%% (rsi,1984))) *) + 0xc5; 0xfd; 0x6f; 0x9e; 0xe0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm3) (Memop Word256 (%% (rsi,2016))) *) + 0xc5; 0xfd; 0x6f; 0xa2; 0xc0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm4) (Memop Word256 (%% (rdx,1984))) *) + 0xc5; 0xfd; 0x6f; 0xaa; 0xe0; 0x07; 0x00; 0x00; + (* VMOVDQA (%_% ymm5) (Memop Word256 (%% (rdx,2016))) *) + 0xc5; 0xfd; 0x6f; 0xb1; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (%_% ymm6) (Memop Word256 (%% (rcx,992))) *) + 0xc5; 0x75; 0xd5; 0xea; (* VPMULLW (%_% ymm13) (%_% ymm1) (%_% ymm2) *) + 0xc5; 0x75; 0xd5; 0xf3; (* VPMULLW (%_% ymm14) (%_% ymm1) (%_% ymm3) *) + 0xc4; 0xc1; 0x5d; 0xd5; 0xfd; + (* VPMULLW (%_% ymm7) (%_% ymm4) (%_% ymm13) *) + 0xc4; 0x41; 0x55; 0xd5; 0xcd; + (* VPMULLW (%_% ymm9) (%_% ymm5) (%_% ymm13) *) + 0xc4; 0x41; 0x4d; 0xd5; 0xc6; + (* VPMULLW (%_% ymm8) (%_% ymm6) (%_% ymm14) *) + 0xc4; 0x41; 0x5d; 0xd5; 0xd6; + (* VPMULLW (%_% ymm10) (%_% ymm4) (%_% ymm14) *) + 0xc5; 0xfd; 0xe5; 0xff; (* VPMULHW (%_% ymm7) (%_% ymm0) (%_% ymm7) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc9; + (* VPMULHW (%_% ymm9) (%_% ymm0) (%_% ymm9) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xc0; + (* VPMULHW (%_% ymm8) (%_% ymm0) (%_% ymm8) *) + 0xc4; 0x41; 0x7d; 0xe5; 0xd2; + (* VPMULHW (%_% ymm10) (%_% ymm0) (%_% ymm10) *) + 0xc5; 0x5d; 0xe5; 0xda; (* VPMULHW (%_% ymm11) (%_% ymm4) (%_% ymm2) *) + 0xc5; 0x55; 0xe5; 0xe2; (* VPMULHW (%_% ymm12) (%_% ymm5) (%_% ymm2) *) + 0xc5; 0x4d; 0xe5; 0xeb; (* VPMULHW (%_% 
ymm13) (%_% ymm6) (%_% ymm3) *) + 0xc5; 0x5d; 0xe5; 0xf3; (* VPMULHW (%_% ymm14) (%_% ymm4) (%_% ymm3) *) + 0xc5; 0xa5; 0xf9; 0xff; (* VPSUBW (%_% ymm7) (%_% ymm11) (%_% ymm7) *) + 0xc4; 0x41; 0x1d; 0xf9; 0xc9; + (* VPSUBW (%_% ymm9) (%_% ymm12) (%_% ymm9) *) + 0xc4; 0x41; 0x3d; 0xf9; 0xc5; + (* VPSUBW (%_% ymm8) (%_% ymm8) (%_% ymm13) *) + 0xc4; 0x41; 0x0d; 0xf9; 0xd2; + (* VPSUBW (%_% ymm10) (%_% ymm14) (%_% ymm10) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0x7d; 0x6f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm8) (Memop Word256 (%% (rdi,448))) *) + 0xc5; 0x7d; 0x6f; 0x97; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (%_% ymm10) (Memop Word256 (%% (rdi,480))) *) + 0xc5; 0xbd; 0xfd; 0xff; (* VPADDW (%_% ymm7) (%_% ymm8) (%_% ymm7) *) + 0xc4; 0x41; 0x2d; 0xfd; 0xc9; + (* VPADDW (%_% ymm9) (%_% ymm10) (%_% ymm9) *) + 0xc5; 0xfd; 0x7f; 0xbf; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm7) *) + 0xc5; 0x7d; 0x7f; 0x8f; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm9) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +(* The theorems below are stated over the _tmc constant built here by define_trimmed, and use its _EXEC rule for symbolic stepping. *) +let mlkem_basemul_k4_tmc = define_trimmed "mlkem_basemul_k4_tmc" mlkem_basemul_k4_mc;; +let mlkem_basemul_k4_tmc_EXEC = X86_MK_CORE_EXEC_RULE mlkem_basemul_k4_tmc;; + +(* Enable simplification of word_subwords by default. + Needed to prevent the symbolic simulation from exploding + as we add more instructions.
 The list in effect before this change is kept in org_extra_word_conv. *) +let org_extra_word_conv = !extra_word_CONV;; +extra_word_CONV := [WORD_SIMPLE_SUBWORD_CONV] @ !extra_word_CONV;; + +(* Integer-level model of the arithmetic that the postconditions of the theorems below are stated against. *) +(* (a + bX) * (c + dX) = (a*c + b*dz) + (a*d + b*c)X, i.e. the X^2 term b*d is replaced by b*dz (NOTE(review): dz is presumably d times the relevant twiddle factor, read from src2t - confirm). pmul0 is the constant coefficient, pmul0_odd is the same with b*dz subtracted instead of added, pmul1 is the X coefficient. *) +let pmul0 = define + `pmul0 (a: int) (b : int) (c : int) (dz : int) = b*dz + a*c`;; + +let pmul0_odd = define + `pmul0_odd (a: int) (b : int) (c : int) (dz : int) = a*c - b*dz`;; + +let pmul1 = define + `pmul1 (a: int) (b : int) (c : int) (d : int) = b*c + a*d`;; +(* Accumulated constant coefficient: sum of pmul0 over the four input pairs 0..3. The d0..d3 arguments are not used in the body. *) +let pmulacc0_k4 = define + `pmulacc0_k4 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + pmul0 a0 b0 c0 dz0 + pmul0 a1 b1 c1 dz1 + + pmul0 a2 b2 c2 dz2 + pmul0 a3 b3 c3 dz3`;; +(* Same shape as pmulacc0_k4 but summing pmul0_odd; again d0..d3 are not used in the body. *) +let pmulacc0_odd_k4 = define + `pmulacc0_odd_k4 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + pmul0_odd a0 b0 c0 dz0 + pmul0_odd a1 b1 c1 dz1 + + pmul0_odd a2 b2 c2 dz2 + pmul0_odd a3 b3 c3 dz3`;; +(* Accumulated X coefficient: sum of pmul1 over the four input pairs 0..3. The dz0..dz3 arguments are not used in the body. *) +let pmulacc1_k4 = define + `pmulacc1_k4 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + pmul1 a0 b0 c0 d0 + pmul1 a1 b1 c1 d1 + + pmul1 a2 b2 c2 d2 + pmul1 a3 b3 c3 d3`;; +(* pmulacc0_k4 multiplied by inverse_mod 3329 65536 and reduced with rem 3329. *) +let pmulaccred0_k4 = define + `pmulaccred0_k4 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + (&(inverse_mod 3329 65536) * + pmulacc0_k4 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3) rem &3329`;; +(* Counterpart of pmulaccred0_k4 built from pmulacc0_odd_k4. *) +let pmulaccred0_odd_k4 = define + `pmulaccred0_odd_k4
(a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + (&(inverse_mod 3329 65536) * + pmulacc0_odd_k4 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3) rem &3329`;; + +(* pmulacc1_k4 multiplied by inverse_mod 3329 65536 and reduced with rem 3329. *) +let pmulaccred1_k4 = define + `pmulaccred1_k4 (a0: int) (b0 : int) (c0 : int) (d0 : int) (dz0 : int) + (a1: int) (b1 : int) (c1 : int) (d1 : int) (dz1 : int) + (a2: int) (b2 : int) (c2 : int) (d2 : int) (dz2 : int) + (a3: int) (b3 : int) (c3 : int) (d3 : int) (dz3 : int) = + (&(inverse_mod 3329 65536) * + pmulacc1_k4 a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3) rem &3329`;; +(* Core correctness statement. The code is loaded as BUTLAST of the _tmc list (all bytes but the last) and run from pc with C arguments dst, src1, src2, src2t until RIP = pc + 5202. Input k (k = 0..3) is read at offset 512*k in src1 and src2 and 256*k in src2t; within each 64-byte block of src1/src2 the first 32 bytes hold a/c and the next 32 bytes hold b/d. For every index whose a and b inputs are at most 2^12 in absolute value, the 16-bit words of dst are congruent mod 3329 to pmulaccred0_k4 (first half of even 64-byte blocks), pmulaccred0_odd_k4 (first half of odd blocks) and pmulaccred1_k4 (second half of every block). *) +let MLKEM_BASEMUL_K4_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3 pc. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 2048); (src2, 2048); (src2t, 1024)] /\ + nonoverlapping (dst, 512) (word pc, 5202) + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mlkem_basemul_k4_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j.
j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 2*i)))) s = a3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 32 + 2*i)))) s = b3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 2*i)))) s = c3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 32 + 2*i)))) s = d3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (768 + 32*j + 2*i)))) s = dz3 i j)) + (\s. read RIP s = word (pc + 5202) /\ + (!i. i < 16 ==> !j.
j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j.
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 /\ + abs(ival(a3 i j)) <= &2 pow 12 /\ + abs(ival(b3 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k4 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + (ival (a3 i j)) (ival (b3 i j)) (ival (c3 i j)) (ival (d3 i j)) (ival (dz3 i j)) + ) (mod &3329))) + (MAYCHANGE [events] ,, + MAYCHANGE [RIP] ,, MAYCHANGE [RAX] ,, + MAYCHANGE [ZMM0; ZMM1; ZMM2; ZMM3; ZMM4; ZMM5; ZMM6; ZMM7; + ZMM8; ZMM9; ZMM10; ZMM11; ZMM12; ZMM13; ZMM14] ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + + REWRITE_TAC [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + NONOVERLAPPING_CLAUSES; ALL; C_ARGUMENTS; fst mlkem_basemul_k4_tmc_EXEC] THEN + REPEAT STRIP_TAC THEN + (* Give names to the initial contents of YMM0 and YMM1. *) + GHOST_INTRO_TAC `init_ymm0:int256` `read YMM0` THEN + GHOST_INTRO_TAC `init_ymm1:int256` `read YMM1` THEN + (* Rewrite the precondition only: EXPAND_CASES_CONV (NOTE(review): presumably unrolls the bounded i and j quantifiers - confirm), then evaluate the resulting offset arithmetic. *) + CONV_TAC(RATOR_CONV(LAND_CONV(TOP_DEPTH_CONV EXPAND_CASES_CONV))) THEN + CONV_TAC(TOP_DEPTH_CONV NUM_MULT_CONV THENC + TOP_DEPTH_CONV NUM_ADD_CONV) THEN + + ENSURES_INIT_TAC "s0" THEN + (* NOTE(review): presumably regroups the 16-bit reads of each buffer into 256-bit reads matching the VMOVDQA loads; the numeric argument is 64 for src1/src2 and 32 for src2t - confirm against the helper. *) + MEMORY_256_FROM_16_TAC "src1" 64 THEN + MEMORY_256_FROM_16_TAC "src2" 64 THEN + MEMORY_256_FROM_16_TAC "src2t" 32 THEN + + ASM_REWRITE_TAC [WORD_ADD_0] THEN + DISCARD_MATCHING_ASSUMPTIONS [`read (memory :> bytes16 any) s = x`] THEN + REPEAT STRIP_TAC THEN + (* Step through instructions 1 to 966 one at a time, running SIMD_SIMPLIFY_TAC with montmul_x86 and montmul_odd_x86 after every step. *) + MAP_EVERY (fun n -> X86_STEPS_TAC mlkem_basemul_k4_tmc_EXEC [n] THEN + SIMD_SIMPLIFY_TAC [montmul_x86; montmul_odd_x86]) + (1--966) THEN + + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + (* For every assumption that is an int256-typed read equation: apply READ_MEMORY_SPLIT_CONV 4, then SIMD_SIMPLIFY_CONV, then strip the result back into the assumptions. *) + REPEAT(FIRST_X_ASSUM(STRIP_ASSUME_TAC o + CONV_RULE(SIMD_SIMPLIFY_CONV[]) o +
CONV_RULE(READ_MEMORY_SPLIT_CONV 4) o + check (can (term_match [] `read qqq s:int256 = xxx`) o concl))) THEN + (* Same unrolling for the goal as was done for the precondition, now also reducing let-bindings; the DEPTH_CONV pass is applied twice. *) + CONV_TAC(TOP_DEPTH_CONV EXPAND_CASES_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + DEPTH_CONV let_CONV) THEN + CONV_TAC(DEPTH_CONV NUM_MULT_CONV THENC + DEPTH_CONV NUM_ADD_CONV THENC + DEPTH_CONV let_CONV) THEN + ASM_REWRITE_TAC[WORD_ADD_0] THEN + + DISCARD_STATE_TAC "s966" THEN + (* Split into one goal per conjunct; for each, unfold the pmul definitions, move the bounds into the assumptions, apply GEN_CONGBOUND_RULE to the value on the left of the congruence and keep its first conjunct, then finish with INT_RING. *) + REPEAT CONJ_TAC THEN + REWRITE_TAC[pmulaccred0_k4; pmulacc0_k4; pmul0; pmulaccred0_odd_k4; + pmulacc0_odd_k4; pmul0_odd; pmulaccred1_k4; pmulacc1_k4; pmul1] THEN + STRIP_TAC THEN + ASSUM_LIST((fun ths -> W(MP_TAC o CONJUNCT1 o GEN_CONGBOUND_RULE ths o + rand o lhand o rator o snd))) THEN + REWRITE_TAC[GSYM INT_REM_EQ] THEN CONV_TAC INT_REM_DOWN_CONV THEN + MATCH_MP_TAC EQ_IMP THEN AP_TERM_TAC THEN AP_THM_TAC THEN AP_TERM_TAC THEN + CONV_TAC INT_RING +);; + +let MLKEM_BASEMUL_K4_NOIBT_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 2048); (src2, 2048); (src2t, 1024)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k4_tmc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mlkem_basemul_k4_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j.
j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 2*i)))) s = a3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 32 + 2*i)))) s = b3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 2*i)))) s = c3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 32 + 2*i)))) s = d3 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (768 + 32*j + 2*i)))) s = dz3 i j)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 /\ + abs(ival(a3 i j)) <= &2 pow 12 /\ + abs(ival(b3 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k4 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + (ival (a3 i j)) (ival (b3 i j)) (ival (c3 i j)) (ival (d3 i j)) (ival (dz3 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mlkem_basemul_k4_tmc MLKEM_BASEMUL_K4_CORRECT);; + +let MLKEM_BASEMUL_K4_SUBROUTINE_CORRECT = prove( + `!src1 src2 src2t dst a0 b0 c0 d0 dz0 a1 b1 c1 d1 dz1 a2 b2 c2 d2 dz2 a3 b3 c3 d3 dz3 pc stackpointer returnaddress. + aligned 32 src1 /\ + aligned 32 src2 /\ + aligned 32 src2t /\ + aligned 32 dst /\ + ALL (nonoverlapping (dst, 512)) [(src1, 2048); (src2, 2048); (src2t, 1024)] /\ + nonoverlapping (dst, 512) (word pc, LENGTH mlkem_basemul_k4_mc) /\ + nonoverlapping (dst, 512) (stackpointer, 8) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mlkem_basemul_k4_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [dst; src1; src2; src2t] s /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 2*i)))) s = a0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (64*j + 32 + 2*i)))) s = b0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 2*i)))) s = c0 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (64*j + 32 + 2*i)))) s = d0 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (32*j + 2*i)))) s = dz0 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 2*i)))) s = a1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (512 + 64*j + 32 + 2*i)))) s = b1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 2*i)))) s = c1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (512 + 64*j + 32 + 2*i)))) s = d1 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (256 + 32*j + 2*i)))) s = dz1 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 2*i)))) s = a2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1024 + 64*j + 32 + 2*i)))) s = b2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 2*i)))) s = c2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1024 + 64*j + 32 + 2*i)))) s = d2 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (512 + 32*j + 2*i)))) s = dz2 i j) /\ + + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 2*i)))) s = a3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src1 (word (1536 + 64*j + 32 + 2*i)))) s = b3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 2*i)))) s = c3 i j) /\ + (!i. i < 16 ==> !j. j < 8 + ==> read(memory :> bytes16 + (word_add src2 (word (1536 + 64*j + 32 + 2*i)))) s = d3 i j) /\ + (!i. i < 16 ==> !j. 
j < 8 + ==> read(memory :> bytes16 + (word_add src2t (word (768 + 32*j + 2*i)))) s = dz3 i j)) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. j < 4 + ==> (let j' = 2*j+1 in + (abs(ival(a0 i j')) <= &2 pow 12 /\ + abs(ival(b0 i j')) <= &2 pow 12 /\ + abs(ival(a1 i j')) <= &2 pow 12 /\ + abs(ival(b1 i j')) <= &2 pow 12 /\ + abs(ival(a2 i j')) <= &2 pow 12 /\ + abs(ival(b2 i j')) <= &2 pow 12 /\ + abs(ival(a3 i j')) <= &2 pow 12 /\ + abs(ival(b3 i j')) <= &2 pow 12 + ==> + (ival (read(memory :> bytes16 (word_add dst (word (64*j' + 2*i)))) s) + == + pmulaccred0_odd_k4 (ival (a0 i j')) (ival (b0 i j')) (ival (c0 i j')) (ival (d0 i j')) (ival (dz0 i j')) + (ival (a1 i j')) (ival (b1 i j')) (ival (c1 i j')) (ival (d1 i j')) (ival (dz1 i j')) + (ival (a2 i j')) (ival (b2 i j')) (ival (c2 i j')) (ival (d2 i j')) (ival (dz2 i j')) + (ival (a3 i j')) (ival (b3 i j')) (ival (c3 i j')) (ival (d3 i j')) (ival (dz3 i j')) + ) (mod &3329)))) /\ + + (!i. i < 16 ==> !j. 
j < 8 + ==> abs(ival(a0 i j)) <= &2 pow 12 /\ + abs(ival(b0 i j)) <= &2 pow 12 /\ + abs(ival(a1 i j)) <= &2 pow 12 /\ + abs(ival(b1 i j)) <= &2 pow 12 /\ + abs(ival(a2 i j)) <= &2 pow 12 /\ + abs(ival(b2 i j)) <= &2 pow 12 /\ + abs(ival(a3 i j)) <= &2 pow 12 /\ + abs(ival(b3 i j)) <= &2 pow 12 + ==> (ival (read(memory :> bytes16 (word_add dst (word (64*j + 32 + 2*i)))) s) + == + pmulaccred1_k4 (ival (a0 i j)) (ival (b0 i j)) (ival (c0 i j)) (ival (d0 i j)) (ival (dz0 i j)) + (ival (a1 i j)) (ival (b1 i j)) (ival (c1 i j)) (ival (d1 i j)) (ival (dz1 i j)) + (ival (a2 i j)) (ival (b2 i j)) (ival (c2 i j)) (ival (d2 i j)) (ival (dz2 i j)) + (ival (a3 i j)) (ival (b3 i j)) (ival (c3 i j)) (ival (d3 i j)) (ival (dz3 i j)) + ) (mod &3329))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(dst, 512)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLKEM_BASEMUL_K4_NOIBT_SUBROUTINE_CORRECT));; + diff --git a/scripts/autogen b/scripts/autogen index b984578b5b..312d0d4c12 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2215,10 +2215,10 @@ def update_via_simpasm( def gen_hol_light_asm_file(job): - infile, outfile, indir, cflags = job + infile, outfile, indir, cflags, arm_or_x86 = job update_via_simpasm( f"{indir}/{infile}", - "proofs/hol_light/arm/mlkem", + "proofs/hol_light/" + arm_or_x86 + "/mlkem", outfile=outfile, cflags=cflags, preserve_header=False, @@ -2227,99 +2227,145 @@ def gen_hol_light_asm_file(job): def gen_hol_light_asm(): - joblist = [ + joblist_arm = [ ( "ntt.S", "mlkem_ntt.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "intt.S", "mlkem_intt.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "poly_tomont_asm.S", "mlkem_poly_tomont.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "poly_tobytes_asm.S", "mlkem_poly_tobytes.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 
-Imlkem/src/native/aarch64/src", + "arm", ), ( "poly_reduce_asm.S", "mlkem_poly_reduce.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "poly_mulcache_compute_asm.S", "mlkem_poly_mulcache_compute.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "polyvec_basemul_acc_montgomery_cached_asm_k2.S", "mlkem_poly_basemul_acc_montgomery_cached_k2.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "polyvec_basemul_acc_montgomery_cached_asm_k3.S", "mlkem_poly_basemul_acc_montgomery_cached_k3.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "polyvec_basemul_acc_montgomery_cached_asm_k4.S", "mlkem_poly_basemul_acc_montgomery_cached_k4.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "rej_uniform_asm.S", "mlkem_rej_uniform.S", "dev/aarch64_opt/src", "-march=armv8.4-a+sha3 -Imlkem/src/native/aarch64/src", + "arm", ), ( "keccak_f1600_x1_scalar_asm.S", "keccak_f1600_x1_scalar.S", "dev/fips202/aarch64/src", "-march=armv8.4-a+sha3 -Imlkem/src/fips202/native/aarch64/src/", + "arm", ), ( "keccak_f1600_x1_v84a_asm.S", "keccak_f1600_x1_v84a.S", "dev/fips202/aarch64/src", "-march=armv8.4-a+sha3 -Imlkem/src/fips202/native/aarch64/src/", + "arm", ), ( "keccak_f1600_x2_v84a_asm.S", "keccak_f1600_x2_v84a.S", "dev/fips202/aarch64/src", "-march=armv8.4-a+sha3 -Imlkem/src/fips202/native/aarch64/src/", + "arm", ), ( "keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S", "keccak_f1600_x4_v8a_v84a_scalar.S", "dev/fips202/aarch64/src", "-march=armv8.4-a+sha3 -Imlkem/src/fips202/native/aarch64/src/", + "arm", ), ( "keccak_f1600_x4_v8a_scalar_hybrid_asm.S", "keccak_f1600_x4_v8a_scalar.S", "dev/fips202/aarch64/src", "-march=armv8.4-a+sha3 -Imlkem/src/fips202/native/aarch64/src/", + "arm", ), ] + joblist_x86 = [ + ( + "polyvec_basemul_acc_montgomery_cached_asm_k2.S", + 
"mlkem_poly_basemul_acc_montgomery_cached_k2.S", + "dev/x86_64/src", + "-Imlkem/src/native/x86_64/src -Imlkem/src/common.h -mavx2 -mbmi2 -msse4 -fcf-protection=full", + "x86", + ), + ( + "polyvec_basemul_acc_montgomery_cached_asm_k3.S", + "mlkem_poly_basemul_acc_montgomery_cached_k3.S", + "dev/x86_64/src", + "-Imlkem/src/native/x86_64/src -Imlkem/src/common.h -mavx2 -mbmi2 -msse4 -fcf-protection=full", + "x86", + ), + ( + "polyvec_basemul_acc_montgomery_cached_asm_k4.S", + "mlkem_poly_basemul_acc_montgomery_cached_k4.S", + "dev/x86_64/src", + "-Imlkem/src/native/x86_64/src -Imlkem/src/common.h -mavx2 -mbmi2 -msse4 -fcf-protection=full", + "x86", + ), + ] + + if platform.machine().lower() in ["arm64", "aarch64"]: + joblist = joblist_arm + elif platform.machine().lower() in ["x86_64"]: + joblist = joblist_x86 + else: + return + with ThreadPoolExecutor() as executor: _ = list(executor.map(partial(gen_hol_light_asm_file), joblist)) @@ -3039,7 +3085,7 @@ def gen_citations(): def extract_bytecode_from_output(output_text): - """Convert output of proofs/hol_light/arm/proofs/dump_bytecode.native + """Convert output of proofs/hol_light/arm_or_x86/proofs/dump_bytecode.native into a dictionary mapping function names to byte code strings.""" bytecode_dict = {} @@ -3085,16 +3131,10 @@ def update_bytecode_in_proof_script(filepath, bytecode): update_file(filepath, updated_content) -def update_hol_light_bytecode(): - """Update HOL Light proof files with bytecode from make dump_bytecode.""" - status_update( - "HOL Light bytecode", - "Running make dump_bytecode ... 
(this may take a few minutes)", - ) - +def update_hol_light_bytecode_for_arch(arm_or_x86): # Run make to get bytecode output result = subprocess.run( - ["make", "-C", "proofs/hol_light/arm", "dump_bytecode"], + ["make", "-C", "proofs/hol_light/" + arm_or_x86, "dump_bytecode"], capture_output=True, text=True, check=True, @@ -3106,10 +3146,25 @@ def update_hol_light_bytecode(): # Update each .ml file for obj_name, bytecode in bytecode_dict.items(): - ml_file = "proofs/hol_light/arm/proofs/" + obj_name + ".ml" + ml_file = "proofs/hol_light/" + arm_or_x86 + "/proofs/" + obj_name + ".ml" update_bytecode_in_proof_script(ml_file, bytecode) +def update_hol_light_bytecode(): + """Update HOL Light proof files with bytecode from make dump_bytecode.""" + status_update( + "HOL Light bytecode", + "Running make dump_bytecode ... (this may take a few minutes)", + ) + + if platform.machine().lower() in ["arm64", "aarch64"]: + update_hol_light_bytecode_for_arch("arm") + elif platform.machine().lower() in ["x86_64"]: + update_hol_light_bytecode_for_arch("x86") + else: + return + + def gen_test_config(config_path, config_spec, default_config_content): """Generate a config file by modifying the default config.""" status_update("test configs", config_path) @@ -3308,7 +3363,7 @@ def _main(): gen_riscv64_zeta_files() high_level_status("Generated zeta and lookup tables") - if platform.machine().lower() in ["arm64", "aarch64"]: + if platform.machine().lower() in ["arm64", "aarch64", "x86_64"]: gen_hol_light_asm() high_level_status("Generated HOL Light assembly") diff --git a/scripts/tests b/scripts/tests index b1506135b7..1d414f5994 100755 --- a/scripts/tests +++ b/scripts/tests @@ -943,18 +943,25 @@ class Tests: def hol_light(self): - def list_proofs(): - cmd_str = ["./proofs/hol_light/arm/list_proofs.sh"] + if platform.machine().lower() in ["arm64", "aarch64"]: + arm_or_x86 = "arm" + elif platform.machine().lower() in ["x86_64"]: + arm_or_x86 = "x86" + else: + return + + def 
list_proofs(arm_or_x86): + cmd_str = ["./proofs/hol_light/" + arm_or_x86 + "/list_proofs.sh"] p = subprocess.run(cmd_str, capture_output=True, universal_newlines=False) proofs = filter(lambda s: s.strip() != "", p.stdout.decode().split("\n")) return list(proofs) if self.args.list_functions: - for p in list_proofs(): + for p in list_proofs(arm_or_x86): print(p) exit(0) - def run_hol_light_single_step(proofs): + def run_hol_light_single_step(proofs, arm_or_x86): num_proofs = len(proofs) for i, func in enumerate(proofs): log = logger(f"HOL_LIGHT ({i+1}/{num_proofs})", None, None, None) @@ -962,7 +969,7 @@ class Tests: start = time.time() proof_bin = f"mlkem/{func}.native" proof_target = f"mlkem/{func}.correct" - proof_dir = "proofs/hol_light/arm" + proof_dir = "proofs/hol_light/" + arm_or_x86 # Remove intermediate proof files to force-rerun try: os.remove(os.path.join(proof_dir, proof_bin)) @@ -975,7 +982,7 @@ class Tests: f"mlkem/{func}.correct", ] + self.make_j(), - cwd="proofs/hol_light/arm", + cwd="proofs/hol_light/" + arm_or_x86, env=os.environ.copy(), capture_output=(self.args.verbose is False), ) @@ -990,11 +997,11 @@ class Tests: else: log.info(f" SUCCESS (after {dur}s)") - proofs = list_proofs() + proofs = list_proofs(arm_or_x86) if self.args.proof is not None: proofs = self.args.proof - run_hol_light_single_step(proofs) + run_hol_light_single_step(proofs, arm_or_x86) self.check_fail() From f28773127eb30705ac8267921330450675567534 Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Tue, 2 Dec 2025 04:44:36 +0000 Subject: [PATCH 2/2] HOL-Light: Make dump_bytecode work in interactive shell Signed-off-by: Hanno Becker --- proofs/hol_light/arm/Makefile | 2 +- proofs/hol_light/arm/proofs/dump_bytecode.ml | 60 +++++++++---------- .../arm/proofs/keccak_f1600_x1_scalar.ml | 2 +- .../arm/proofs/keccak_f1600_x4_v8a_scalar.ml | 2 +- .../proofs/keccak_f1600_x4_v8a_v84a_scalar.ml | 2 +- .../arm/proofs/mlkem_poly_mulcache_compute.ml | 2 +- proofs/hol_light/x86/Makefile | 2 
+- proofs/hol_light/x86/proofs/dump_bytecode.ml | 10 ++-- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/proofs/hol_light/arm/Makefile b/proofs/hol_light/arm/Makefile index 2c1f2c1fcf..4adf6eccf8 100644 --- a/proofs/hol_light/arm/Makefile +++ b/proofs/hol_light/arm/Makefile @@ -144,7 +144,7 @@ run_proofs: build_proofs $(PROOF_LOGS); proofs: run_proofs ; $(SRC)/tools/count-proofs.sh . dump_bytecode: proofs/dump_bytecode.native - ./$< + cd .. && ./arm/$< .PHONY: proofs build_proofs run_proofs sematest clean dump_bytecode diff --git a/proofs/hol_light/arm/proofs/dump_bytecode.ml b/proofs/hol_light/arm/proofs/dump_bytecode.ml index ccd1e65e17..a695fca4c4 100644 --- a/proofs/hol_light/arm/proofs/dump_bytecode.ml +++ b/proofs/hol_light/arm/proofs/dump_bytecode.ml @@ -5,62 +5,62 @@ needs "arm/proofs/base.ml";; -print_string "=== bytecode start: mlkem/keccak_f1600_x1_v84a.o ===\n";; -print_literal_from_elf "mlkem/keccak_f1600_x1_v84a.o";; +print_string "=== bytecode start: arm/mlkem/keccak_f1600_x1_v84a.o ===\n";; +print_literal_from_elf "arm/mlkem/keccak_f1600_x1_v84a.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/keccak_f1600_x1_scalar.o ===\n";; -print_literal_from_elf "mlkem/keccak_f1600_x1_scalar.o";; +print_string "=== bytecode start: arm/mlkem/keccak_f1600_x1_scalar.o ===\n";; +print_literal_from_elf "arm/mlkem/keccak_f1600_x1_scalar.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/keccak_f1600_x2_v84a.o ===\n";; -print_literal_from_elf "mlkem/keccak_f1600_x2_v84a.o";; +print_string "=== bytecode start: arm/mlkem/keccak_f1600_x2_v84a.o ===\n";; +print_literal_from_elf "arm/mlkem/keccak_f1600_x2_v84a.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/keccak_f1600_x4_v8a_scalar.o \n";; -print_literal_from_elf
"mlkem/keccak_f1600_x4_v8a_scalar.o";; +print_string "=== bytecode start: arm/mlkem/keccak_f1600_x4_v8a_scalar.o \n";; +print_literal_from_elf "arm/mlkem/keccak_f1600_x4_v8a_scalar.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/keccak_f1600_x4_v8a_v84a_scalar.o ===\n";; -print_literal_from_elf "mlkem/keccak_f1600_x4_v8a_v84a_scalar.o";; +print_string "=== bytecode start: arm/mlkem/keccak_f1600_x4_v8a_v84a_scalar.o ===\n";; +print_literal_from_elf "arm/mlkem/keccak_f1600_x4_v8a_v84a_scalar.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_intt.o ===============\n";; -print_literal_from_elf "mlkem/mlkem_intt.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_intt.o ===============\n";; +print_literal_from_elf "arm/mlkem/mlkem_intt.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_ntt.o ================\n";; -print_literal_from_elf "mlkem/mlkem_ntt.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_ntt.o ================\n";; +print_literal_from_elf "arm/mlkem/mlkem_ntt.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o ===\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; +print_string "=== bytecode start: 
arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o ===\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o ===\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_mulcache_compute.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_mulcache_compute.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_mulcache_compute.o ===\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_mulcache_compute.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_reduce.o ========\n";; -print_literal_from_elf "mlkem/mlkem_poly_reduce.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_reduce.o ========\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_reduce.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_tobytes.o =======\n";; -print_literal_from_elf "mlkem/mlkem_poly_tobytes.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_tobytes.o =======\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_tobytes.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_tomont.o ========\n";; -print_literal_from_elf "mlkem/mlkem_poly_tomont.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_poly_tomont.o ========\n";; +print_literal_from_elf "arm/mlkem/mlkem_poly_tomont.o";; 
print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_rej_uniform.o ========\n";; -print_literal_from_elf "mlkem/mlkem_rej_uniform.o";; +print_string "=== bytecode start: arm/mlkem/mlkem_rej_uniform.o ========\n";; +print_literal_from_elf "arm/mlkem/mlkem_rej_uniform.o";; print_string "==== bytecode end =====================================\n\n";; diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml index 4a8dc6a49f..e1d9727c14 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x1_scalar.ml @@ -10,7 +10,7 @@ needs "arm/proofs/base.ml";; needs "arm/proofs/keccak_spec.ml";; -(**** print_literal_from_elf "mlkem/keccak_f1600_x1_scalar.o";; +(**** print_literal_from_elf "arm/mlkem/keccak_f1600_x1_scalar.o";; ****) let keccak_f1600_x1_scalar_mc = define_assert_from_elf diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml index ecc9819d14..c22178bbf7 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_scalar.ml @@ -10,7 +10,7 @@ needs "arm/proofs/base.ml";; needs "arm/proofs/keccak_spec.ml";; -(**** print_literal_from_elf "mlkem/keccak_f1600_x4_v8a_scalar.o";; +(**** print_literal_from_elf "arm/mlkem/keccak_f1600_x4_v8a_scalar.o";; ****) let keccak_f1600_x4_v8a_scalar_mc = define_assert_from_elf diff --git a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml index ed92f2df63..5f933bc8ea 100644 --- a/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml +++ b/proofs/hol_light/arm/proofs/keccak_f1600_x4_v8a_v84a_scalar.ml @@ -10,7 +10,7 @@ needs "arm/proofs/base.ml";; needs "arm/proofs/keccak_spec.ml";; -(**** print_literal_from_elf 
"mlkem/keccak_f1600_x4_v8a_v84a_scalar.o";; +(**** print_literal_from_elf "arm/mlkem/keccak_f1600_x4_v8a_v84a_scalar.o";; ****) let keccak_f1600_x4_v8a_v84a_scalar_mc = define_assert_from_elf diff --git a/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml b/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml index 2d4a4edf48..1686e722cd 100644 --- a/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml +++ b/proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml @@ -9,7 +9,7 @@ needs "common/mlkem_specs.ml";; needs "arm/proofs/mlkem_utils.ml";; needs "arm/proofs/mlkem_zetas.ml";; -(**** print_literal_from_elf "mlkem/poly_mulcache_compute.o";; +(**** print_literal_from_elf "arm/mlkem/poly_mulcache_compute.o";; ****) diff --git a/proofs/hol_light/x86/Makefile b/proofs/hol_light/x86/Makefile index c03b02749e..14d33a0ee5 100644 --- a/proofs/hol_light/x86/Makefile +++ b/proofs/hol_light/x86/Makefile @@ -122,7 +122,7 @@ run_proofs: build_proofs $(PROOF_LOGS); proofs: run_proofs ; $(SRC)/tools/count-proofs.sh . dump_bytecode: proofs/dump_bytecode.native - ./$< + cd .. 
&& ./x86/$< .PHONY: proofs build_proofs run_proofs sematest clean dump_bytecode diff --git a/proofs/hol_light/x86/proofs/dump_bytecode.ml b/proofs/hol_light/x86/proofs/dump_bytecode.ml index 8da8460c7a..f7029d94a9 100644 --- a/proofs/hol_light/x86/proofs/dump_bytecode.ml +++ b/proofs/hol_light/x86/proofs/dump_bytecode.ml @@ -5,14 +5,14 @@ needs "x86/proofs/base.ml";; -print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; +print_string "=== bytecode start: x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o ===\n";; +print_literal_from_elf "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; +print_string "=== bytecode start: x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o ===\n";; +print_literal_from_elf "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o";; print_string "==== bytecode end =====================================\n\n";; -print_string "=== bytecode start: /mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o ===\n";; -print_literal_from_elf "mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; +print_string "=== bytecode start: x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o ===\n";; +print_literal_from_elf "x86/mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o";; print_string "==== bytecode end =====================================\n\n";;