mrsu0994 committed
Commit 154f182 · 1 Parent(s): 3bf15e2

upload f5-tts source

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +4 -0
  2. BUILD +44 -0
  3. Dockerfile +34 -0
  4. WORKSPACE +154 -0
  5. alphabet.txt +97 -0
  6. app.py +145 -0
  7. bazelisk-linux-amd64 +3 -0
  8. build_ext.sh +3 -0
  9. extract_tacotrons_model.py +8 -0
  10. extract_wavegru_model.py +12 -0
  11. inference.py +90 -0
  12. mono_tts_cbhg_small_0700000.ckpt +3 -0
  13. packages.txt +7 -0
  14. pooch.py +10 -0
  15. requirements.txt +13 -0
  16. sparse_matmul/BUILD +22 -0
  17. sparse_matmul/compute/BUILD +88 -0
  18. sparse_matmul/compute/ar_inputs.h +37 -0
  19. sparse_matmul/compute/gru_gates.h +214 -0
  20. sparse_matmul/compute/gru_gates_arm.h +288 -0
  21. sparse_matmul/compute/gru_gates_avx_fixed.h +348 -0
  22. sparse_matmul/compute/gru_gates_generic.h +97 -0
  23. sparse_matmul/compute/gru_gates_test.cc +164 -0
  24. sparse_matmul/compute/kernels_arm.h +0 -0
  25. sparse_matmul/compute/kernels_avx.h +601 -0
  26. sparse_matmul/compute/kernels_generic.h +273 -0
  27. sparse_matmul/compute/matmul.h +199 -0
  28. sparse_matmul/compute/matmul_fixed_avx2.cc +235 -0
  29. sparse_matmul/compute/matmul_fixed_avx2.h +49 -0
  30. sparse_matmul/compute/matmul_generic.cc +122 -0
  31. sparse_matmul/compute/matmul_generic.h +41 -0
  32. sparse_matmul/compute/thread_bounds.cc +106 -0
  33. sparse_matmul/compute/thread_bounds.h +74 -0
  34. sparse_matmul/layers/BUILD +146 -0
  35. sparse_matmul/layers/csr_blocksparse_matrix.h +835 -0
  36. sparse_matmul/layers/csrblocksparse_test.cc +977 -0
  37. sparse_matmul/layers/errno_mapping.cc +195 -0
  38. sparse_matmul/layers/errno_mapping.h +29 -0
  39. sparse_matmul/layers/masked_sparse_matrix.h +206 -0
  40. sparse_matmul/layers/read_array_ifstream.h +66 -0
  41. sparse_matmul/layers/sparse_linear_layer.h +365 -0
  42. sparse_matmul/layers/sparse_linear_layer_test.cc +187 -0
  43. sparse_matmul/layers/status_macros.h +34 -0
  44. sparse_matmul/layers/testdata/768_512_95_4x4_QRhat_weights.raw.gz +3 -0
  45. sparse_matmul/layers/testdata/768_512_95_4x4_What_weights.raw.gz +3 -0
  46. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_bias.raw.gz +3 -0
  47. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_mask.raw.gz +3 -0
  48. sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_weights.raw.gz +3 -0
  49. sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_bias.raw.gz +3 -0
  50. sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_mask.raw.gz +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ bazelisk-linux-amd64 filter=lfs diff=lfs merge=lfs -text
BUILD ADDED
@@ -0,0 +1,44 @@
+ # [internal] load cc_fuzz_target.bzl
+ # [internal] load cc_proto_library.bzl
+ # [internal] load android_cc_test:def.bzl
+
+ load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
+
+ package(default_visibility = [":__subpackages__"])
+
+ licenses(["notice"])
+
+ # To run all cc_tests in this directory:
+ # bazel test //:all
+
+ # [internal] Command to run dsp_util_android_test.
+
+ # [internal] Command to run lyra_integration_android_test.
+
+ exports_files(
+     srcs = [
+         "wavegru_mod.cc",
+     ],
+ )
+
+ pybind_extension(
+     name = "wavegru_mod",  # This name is not actually created!
+     srcs = ["wavegru_mod.cc"],
+     deps = [
+         "//sparse_matmul",
+     ],
+ )
+
+ py_library(
+     name = "wavegru_mod",
+     data = [":wavegru_mod.so"],
+ )
+
+ py_binary(
+     name = "wavegru",
+     srcs = ["wavegru.py"],
+     deps = [
+         ":wavegru_mod"
+     ],
+ )
+
Dockerfile ADDED
@@ -0,0 +1,34 @@
+ # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+ # you will also find guides on how best to write your Dockerfile
+
+ FROM us-docker.pkg.dev/colab-images/public/runtime:latest
+
+ RUN apt update; apt install libsndfile1-dev make autoconf automake libtool gcc pkg-config -y python3-dev
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ RUN bash ./build_ext.sh
+ EXPOSE 7860
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
+
+ ENTRYPOINT ["python", "app.py"]
WORKSPACE ADDED
@@ -0,0 +1,154 @@
+ ########################
+ # Platform Independent #
+ ########################
+
+ load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
+ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+ # GoogleTest/GoogleMock framework.
+ git_repository(
+     name = "com_google_googletest",
+     remote = "https://github.com/google/googletest.git",
+     tag = "release-1.10.0",
+ )
+
+ # Google benchmark.
+ http_archive(
+     name = "com_github_google_benchmark",
+     urls = ["https://github.com/google/benchmark/archive/bf585a2789e30585b4e3ce6baf11ef2750b54677.zip"],  # 2020-11-26T11:14:03Z
+     strip_prefix = "benchmark-bf585a2789e30585b4e3ce6baf11ef2750b54677",
+     sha256 = "2a778d821997df7d8646c9c59b8edb9a573a6e04c534c01892a40aa524a7b68c",
+ )
+
+ # proto_library, cc_proto_library, and java_proto_library rules implicitly
+ # depend on @com_google_protobuf for protoc and proto runtimes.
+ # This statement defines the @com_google_protobuf repo.
+ git_repository(
+     name = "com_google_protobuf",
+     remote = "https://github.com/protocolbuffers/protobuf.git",
+     tag = "v3.15.4",
+ )
+
+ load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
+ protobuf_deps()
+
+ # Google Abseil Libs
+ git_repository(
+     name = "com_google_absl",
+     remote = "https://github.com/abseil/abseil-cpp.git",
+     branch = "lts_2020_09_23",
+ )
+
+ # Filesystem
+ # The new_* prefix is used because it is not a bazel project and there is
+ # no BUILD file in that repo.
+ FILESYSTEM_BUILD = """
+ cc_library(
+     name = "filesystem",
+     hdrs = glob(["include/ghc/*"]),
+     visibility = ["//visibility:public"],
+ )
+ """
+
+ new_git_repository(
+     name = "gulrak_filesystem",
+     remote = "https://github.com/gulrak/filesystem.git",
+     tag = "v1.3.6",
+     build_file_content = FILESYSTEM_BUILD
+ )
+
+ # Audio DSP
+ git_repository(
+     name = "com_google_audio_dsp",
+     remote = "https://github.com/google/multichannel-audio-tools.git",
+     # There are no tags for this repo, we are synced to bleeding edge.
+     branch = "master",
+     repo_mapping = {
+         "@com_github_glog_glog" : "@com_google_glog"
+     }
+ )
+
+
+ http_archive(
+     name = "pybind11_bazel",
+     strip_prefix = "pybind11_bazel-72cbbf1fbc830e487e3012862b7b720001b70672",
+     urls = ["https://github.com/pybind/pybind11_bazel/archive/72cbbf1fbc830e487e3012862b7b720001b70672.zip"],
+ )
+ # We still require the pybind library.
+ http_archive(
+     name = "pybind11",
+     build_file = "@pybind11_bazel//:pybind11.BUILD",
+     strip_prefix = "pybind11-2.10.0",
+     urls = ["https://github.com/pybind/pybind11/archive/v2.10.0.tar.gz"],
+ )
+ load("@pybind11_bazel//:python_configure.bzl", "python_configure")
+ python_configure(name = "local_config_python")
+
+
+
+ # Transitive dependencies of Audio DSP.
+ http_archive(
+     name = "eigen_archive",
+     build_file = "eigen.BUILD",
+     sha256 = "f3d69ac773ecaf3602cb940040390d4e71a501bb145ca9e01ce5464cf6d4eb68",
+     strip_prefix = "eigen-eigen-049af2f56331",
+     urls = [
+         "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
+         "https://bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz",
+     ],
+ )
+
+ http_archive(
+     name = "fft2d",
+     build_file = "fft2d.BUILD",
+     sha256 = "ada7e99087c4ed477bfdf11413f2ba8db8a840ba9bbf8ac94f4f3972e2a7cec9",
+     urls = [
+         "http://www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz",
+     ],
+ )
+
+ # Google logging
+ git_repository(
+     name = "com_google_glog",
+     remote = "https://github.com/google/glog.git",
+     tag = "v0.5.0"
+ )
+ # Dependency for glog
+ git_repository(
+     name = "com_github_gflags_gflags",
+     remote = "https://github.com/mchinen/gflags.git",
+     branch = "android_linking_fix"
+ )
+
+ # Bazel/build rules
+
+ http_archive(
+     name = "bazel_skylib",
+     urls = [
+         "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+         "https://github.com/bazelbuild/bazel-skylib/releases/download/1.0.2/bazel-skylib-1.0.2.tar.gz",
+     ],
+     sha256 = "97e70364e9249702246c0e9444bccdc4b847bed1eb03c5a3ece4f83dfe6abc44",
+ )
+ load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+ bazel_skylib_workspace()
+
+ http_archive(
+     name = "rules_android",
+     sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806",
+     strip_prefix = "rules_android-0.1.1",
+     urls = ["https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip"],
+ )
+
+ # Google Maven Repository
+ GMAVEN_TAG = "20180625-1"
+
+ http_archive(
+     name = "gmaven_rules",
+     strip_prefix = "gmaven_rules-%s" % GMAVEN_TAG,
+     url = "https://github.com/bazelbuild/gmaven_rules/archive/%s.tar.gz" % GMAVEN_TAG,
+ )
+
+ load("@gmaven_rules//:gmaven.bzl", "gmaven_rules")
+
+ gmaven_rules()
alphabet.txt ADDED
@@ -0,0 +1,97 @@
+ _
+
+
+ !
+ ,
+ .
+ :
+ ?
+ a
+ b
+ c
+ d
+ e
+ g
+ h
+ i
+ k
+ l
+ m
+ n
+ o
+ p
+ q
+ r
+ s
+ t
+ u
+ v
+ x
+ y
+ à
+ á
+ â
+ ã
+ è
+ é
+ ê
+ ì
+ í
+ ò
+ ó
+ ô
+ õ
+ ù
+ ú
+ ý
+ ă
+ đ
+ ĩ
+ ũ
+ ơ
+ ư
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ế
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
app.py ADDED
@@ -0,0 +1,145 @@
+ ## build wavegru-cpp
+ # import os
+ # os.system("./bazelisk-linux-amd64 clean --expunge")
+ # os.system("./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native")
+
+ # install espeak
+ import os
+ import re
+ import unicodedata
+
+ import regex
+
+ if not os.path.isfile("./wavegru_mod.so"):
+     os.system("bash ./build_ext.sh")
+
+ import gradio as gr
+
+ from inference import load_tacotron_model, load_wavegru_net, mel_to_wav, text_to_mel
+ from wavegru_cpp import extract_weight_mask, load_wavegru_cpp
+
+ alphabet, tacotron_net, tacotron_config = load_tacotron_model(
+     "./alphabet.txt", "./tacotron.toml", "./mono_tts_cbhg_small_0700000.ckpt"
+ )
+
+ wavegru_config, wavegru_net = load_wavegru_net(
+     "./wavegru.yaml", "./wavegru_vocoder_tpu_gta_preemphasis_pruning_0400000.ckpt"
+ )
+
+ wave_cpp_weight_mask = extract_weight_mask(wavegru_net)
+ wavecpp = load_wavegru_cpp(wave_cpp_weight_mask, wavegru_config["upsample_factors"][-1])
+
+
+ space_re = regex.compile(r"\s+")
+ number_re = regex.compile("([0-9]+)")
+ digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
+ num_re = regex.compile(r"([0-9.,]*[0-9])")
+ alphabet_ = "aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵbcdđghklmnpqrstvx"
+ keep_text_and_num_re = regex.compile(rf"[^\s{alphabet_}.,0-9]")
+ keep_text_re = regex.compile(rf"[^\s{alphabet_}]")
+
+
+ def read_number(num: str) -> str:
+     if len(num) == 1:
+         return digits[int(num)]
+     elif len(num) == 2 and num.isdigit():
+         n = int(num)
+         end = digits[n % 10]
+         if n == 10:
+             return "mười"
+         if n % 10 == 5:
+             end = "lăm"
+         if n % 10 == 0:
+             return digits[n // 10] + " mươi"
+         elif n < 20:
+             return "mười " + end
+         else:
+             if n % 10 == 1:
+                 end = "mốt"
+             return digits[n // 10] + " mươi " + end
+     elif len(num) == 3 and num.isdigit():
+         n = int(num)
+         if n % 100 == 0:
+             return digits[n // 100] + " trăm"
+         elif num[1] == "0":
+             return digits[n // 100] + " trăm lẻ " + digits[n % 100]
+         else:
+             return digits[n // 100] + " trăm " + read_number(num[1:])
+     elif len(num) >= 4 and len(num) <= 6 and num.isdigit():
+         n = int(num)
+         n1 = n // 1000
+         return read_number(str(n1)) + " ngàn " + read_number(num[-3:])
+     elif "," in num:
+         n1, n2 = num.split(",")
+         return read_number(n1) + " phẩy " + read_number(n2)
+     elif "." in num:
+         parts = num.split(".")
+         if len(parts) == 2:
+             if parts[1] == "000":
+                 return read_number(parts[0]) + " ngàn"
+             elif parts[1].startswith("00"):
+                 end = digits[int(parts[1][2:])]
+                 return read_number(parts[0]) + " ngàn lẻ " + end
+             else:
+                 return read_number(parts[0]) + " ngàn " + read_number(parts[1])
+         elif len(parts) == 3:
+             return (
+                 read_number(parts[0])
+                 + " triệu "
+                 + read_number(parts[1])
+                 + " ngàn "
+                 + read_number(parts[2])
+             )
+     return num
+
+
+ def normalize_text(text):
+     # lowercase
+     text = text.lower()
+     # unicode normalize
+     text = unicodedata.normalize("NFKC", text)
+     text = text.replace(".", ". ")
+     text = text.replace(",", ", ")
+     text = text.replace(";", "; ")
+     text = text.replace(":", ": ")
+     text = text.replace("!", "! ")
+     text = text.replace("?", "? ")
+     text = text.replace("(", "( ")
+
+     text = num_re.sub(r" \1 ", text)
+     words = text.split()
+     words = [read_number(w) if num_re.fullmatch(w) else w for w in words]
+     text = " ".join(words)
+
+     # remove redundant spaces
+     text = re.sub(r"\s+", " ", text)
+     # remove leading and trailing spaces
+     text = text.strip()
+     return text
+
+
+ def speak(text):
+     text = normalize_text(text)
+     mel = text_to_mel(tacotron_net, text, alphabet, tacotron_config)
+     y = mel_to_wav(wavegru_net, wavecpp, mel, wavegru_config)
+     return 24_000, y
+
+
+ title = "WaveGRU-TTS"
+ description = "WaveGRU text-to-speech demo."
+
+ gr.Interface(
+     fn=speak,
+     inputs="text",
+     examples=[
+         "Trăm năm trong cõi người ta, chữ tài chữ mệnh khéo là ghét nhau.",
+         "Đoạn trường tân thanh, thường được biết đến với cái tên đơn giản là Truyện Kiều, là một truyện thơ của đại thi hào Nguyễn Du.",
+         "Lục Vân Tiên quê ở huyện Đông Thành, khôi ngô tuấn tú, tài kiêm văn võ. Nghe tin triều đình mở khoa thi, Vân Tiên từ giã thầy xuống núi đua tài.",
+         "Lê Quý Đôn, tên thuở nhỏ là Lê Danh Phương, là vị quan thời Lê trung hưng, cũng là nhà thơ và được mệnh danh là nhà bác học lớn của Việt Nam trong thời phong kiến.",
+         "Tất cả mọi người đều sinh ra có quyền bình đẳng. Tạo hóa cho họ những quyền không ai có thể xâm phạm được, trong những quyền ấy, có quyền được sống, quyền tự do và quyền mưu cầu hạnh phúc.",
+     ],
+     outputs="audio",
+     title=title,
+     description=description,
+     theme="default",
+ ).launch()
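For a quick sanity check of the Vietnamese number-reading rules implemented in read_number and normalize_text above, here is a minimal sketch. Assumption: the helpers have been moved into a side-effect-free module (the hypothetical name text_utils below), since importing app.py directly would also load the TTS models.

```python
# Hypothetical usage sketch of the helpers defined in app.py above.
# `text_utils` is a made-up module name; in this commit the functions live in app.py.
from text_utils import normalize_text, read_number

print(read_number("5"))     # "năm"
print(read_number("25"))    # "hai mươi lăm"
print(read_number("105"))   # "một trăm lẻ năm"
print(normalize_text("Tôi có 25 quyển sách!"))  # "tôi có hai mươi lăm quyển sách!"
```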
bazelisk-linux-amd64 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:231ec5ca8115e94c75a1f4fbada1a062b48822ca04f21f26e4cb1cd8973cd458
+ size 5152768
build_ext.sh ADDED
@@ -0,0 +1,3 @@
+ chmod +x ./bazelisk-linux-amd64
+ USE_BAZEL_VERSION=5.0.0 ./bazelisk-linux-amd64 build wavegru_mod -c opt --copt=-march=native
+ cp -f bazel-bin/wavegru_mod.so .
extract_tacotrons_model.py ADDED
@@ -0,0 +1,8 @@
+ import pickle
+
+ import jax
+
+ dic = pickle.load(open("./mono_tts_cbhg_small_0700000.ckpt", "rb"))
+ del dic["optim_state_dict"]
+ dic = jax.device_get(dic)
+ pickle.dump(dic, open("./mono_tts_cbhg_small_0700000.ckpt", "wb"))
extract_wavegru_model.py ADDED
@@ -0,0 +1,12 @@
+ import pickle
+
+ import jax
+
+ dic = pickle.load(
+     open("./wavegru_vocoder_tpu_gta_preemphasis_pruning_0800000.ckpt", "rb")
+ )
+ dic = jax.device_get(dic)
+ del dic["optim_state_dict"]
+ pickle.dump(
+     dic, open("./wavegru_vocoder_tpu_gta_preemphasis_pruning_0800000.ckpt", "wb")
+ )
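Both extract scripts shrink a training checkpoint in place by dropping the optimizer state and pulling the arrays onto the host. A quick, hypothetical way to confirm the result (the remaining key names are not shown in this commit, so they are only printed):

```python
# Hypothetical check that a stripped checkpoint no longer carries optimizer state.
import pickle

with open("./mono_tts_cbhg_small_0700000.ckpt", "rb") as f:
    ckpt = pickle.load(f)

assert "optim_state_dict" not in ckpt   # removed by extract_tacotrons_model.py
print(sorted(ckpt.keys()))              # whatever keys remain are the model weights/config
```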
inference.py ADDED
@@ -0,0 +1,90 @@
+ import os
+
+ import jax
+ import jax.numpy as jnp
+ import librosa
+ import numpy as np
+ import pax
+
+ # from text import english_cleaners
+ from utils import (
+     create_tacotron_model,
+     load_tacotron_ckpt,
+     load_tacotron_config,
+     load_wavegru_ckpt,
+     load_wavegru_config,
+ )
+ from wavegru import WaveGRU
+
+ # os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = "./espeak/usr/lib/libespeak-ng.so.1.1.51"
+ # from phonemizer.backend import EspeakBackend
+ # backend = EspeakBackend("en-us", preserve_punctuation=True, with_stress=True)
+
+
+ def load_tacotron_model(alphabet_file, config_file, model_file):
+     """load tacotron model to memory"""
+     with open(alphabet_file, "r", encoding="utf-8") as f:
+         alphabet = f.read().split("\n")
+
+     config = load_tacotron_config(config_file)
+     net = create_tacotron_model(config)
+     _, net, _ = load_tacotron_ckpt(net, None, model_file)
+     net = net.eval()
+     net = jax.device_put(net)
+     return alphabet, net, config
+
+
+ tacotron_inference_fn = pax.pure(lambda net, text: net.inference(text, max_len=2400))
+
+
+ def text_to_mel(net, text, alphabet, config):
+     """convert text to mel spectrogram"""
+     # text = english_cleaners(text)
+     # text = backend.phonemize([text], strip=True)[0]
+     text = text + config["END_CHARACTER"]
+     text = text + config["PAD"] * (100 - (len(text) % 100))
+     tokens = []
+     for c in text:
+         if c in alphabet:
+             tokens.append(alphabet.index(c))
+     tokens = jnp.array(tokens, dtype=jnp.int32)
+     mel = tacotron_inference_fn(net, tokens[None])
+     return mel
+
+
+ def load_wavegru_net(config_file, model_file):
+     """load wavegru to memory"""
+     config = load_wavegru_config(config_file)
+     net = WaveGRU(
+         mel_dim=config["mel_dim"],
+         rnn_dim=config["rnn_dim"],
+         upsample_factors=config["upsample_factors"],
+         has_linear_output=True,
+     )
+     _, net, _ = load_wavegru_ckpt(net, None, model_file)
+     net = net.eval()
+     net = jax.device_put(net)
+     return config, net
+
+
+ wavegru_inference = pax.pure(lambda net, mel: net.inference(mel, no_gru=True))
+
+
+ def mel_to_wav(net, netcpp, mel, config):
+     """convert mel to wav"""
+     if len(mel.shape) == 2:
+         mel = mel[None]
+     pad = config["num_pad_frames"] // 2 + 2
+     mel = np.pad(mel, [(0, 0), (pad, pad), (0, 0)], mode="edge")
+     ft = wavegru_inference(net, mel)
+     ft = jax.device_get(ft[0])
+     wav = netcpp.inference(ft, 1.0)
+     wav = np.array(wav)
+     wav = librosa.mu_expand(wav - 127, mu=255)
+     wav = librosa.effects.deemphasis(wav, coef=0.86)
+     wav = wav * 2.0
+     wav = wav / max(1.0, np.max(np.abs(wav)))
+     wav = wav * 2**15
+     wav = np.clip(wav, a_min=-(2**15), a_max=(2**15) - 1)
+     wav = wav.astype(np.int16)
+     return wav
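The tail of mel_to_wav above is plain signal post-processing. Below is a minimal sketch of that chain run on random data instead of the WaveGRU output, under the assumption that the C++ sampler returns unsigned 8-bit mu-law indices in [0, 255] (which is why 127 is subtracted before expansion).

```python
# Sketch of the post-processing in mel_to_wav, using dummy data.
# Requires librosa==0.9.0 as pinned in requirements.txt.
import librosa
import numpy as np

mu_law = np.random.randint(0, 256, size=24_000)     # 1 second at 24 kHz (fake samples)
wav = librosa.mu_expand(mu_law - 127, mu=255)       # inverse mu-law companding
wav = librosa.effects.deemphasis(wav, coef=0.86)    # undo the pre-emphasis filter
wav = wav * 2.0
wav = wav / max(1.0, np.max(np.abs(wav)))           # peak-limit only if it clips
wav = np.clip(wav * 2**15, -(2**15), 2**15 - 1).astype(np.int16)
print(wav.dtype, wav.shape)                         # int16 (24000,)
```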
mono_tts_cbhg_small_0700000.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94a3cf9879f6c71ed21a6569f6f8167a8f4990e46b036b5f8196a16ea14fcb7e
+ size 53480857
packages.txt ADDED
@@ -0,0 +1,7 @@
+ libsndfile1-dev
+ make
+ autoconf
+ automake
+ libtool
+ gcc
+ pkg-config
pooch.py ADDED
@@ -0,0 +1,10 @@
+ def os_cache(x):
+     return x
+
+
+ def create(*args, **kwargs):
+     class T:
+         def load_registry(self, *args, **kwargs):
+             return None
+
+     return T()
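This pooch.py is a local stub that shadows the real pooch package. It appears to exist only so that importing librosa 0.9 (which, as an assumption here, calls pooch.os_cache and pooch.create(...).load_registry(...) at import time for its example-data registry) works without the real dependency. A minimal sketch of what the stub has to absorb:

```python
# Sketch of the import-time calls the stub is shaped to satisfy (assumption:
# this mirrors librosa 0.9's use of pooch for its example-data registry).
import pooch  # resolves to the local ./pooch.py when running from the repo root

cache_dir = pooch.os_cache("librosa")        # the stub just echoes the argument back
fetcher = pooch.create(path=cache_dir, base_url="", registry=None)
assert fetcher.load_registry(None) is None   # nothing is ever downloaded
```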
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ inflect
+ jax
+ jaxlib
+ jinja2
+ librosa==0.9.0
+ numpy
+ pax3 @ git+https://github.com/ntt123/pax.git
+ pyyaml
+ toml
+ unidecode
+ phonemizer
+ gradio
+ setuptools
sparse_matmul/BUILD ADDED
@@ -0,0 +1,22 @@
+ # [internal] load placeholder
+
+ licenses(["notice"])
+
+ cc_library(
+     name = "sparse_matmul",
+     hdrs = [
+         "sparse_matmul.h",
+     ],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//sparse_matmul/compute:gru_gates",
+         "//sparse_matmul/layers:layer",
+         "//sparse_matmul/layers:matrix",
+         "//sparse_matmul/layers:utils",
+         "//sparse_matmul/numerics:fast_transcendentals",
+         "//sparse_matmul/numerics:types",
+         "//sparse_matmul/os:coop_threads",
+         "//sparse_matmul/vector:cache_aligned_vector",
+     ],  # internal :sparse_matmul deps placeholder
+ )
+
sparse_matmul/compute/BUILD ADDED
@@ -0,0 +1,88 @@
+ # Low-level computation code, including generic and architecture-specific
+ # variants.
+
+ licenses(["notice"])
+
+ cc_library(
+     name = "gru_gates",
+     srcs = [
+         "ar_inputs.h",
+         "gru_gates_arm.h",
+         "gru_gates_avx_fixed.h",
+         "gru_gates_generic.h",
+     ],
+     hdrs = ["gru_gates.h"],
+     visibility = [
+         "//visibility:public",
+     ],
+     deps = [
+         ":matmul",
+         "//sparse_matmul/numerics:fast_transcendentals",
+         "//sparse_matmul/numerics:types",
+         "//sparse_matmul/vector:cache_aligned_vector",
+     ],
+ )
+
+ cc_library(
+     name = "kernels",
+     srcs = [
+         "kernels_arm.h",
+         "kernels_avx.h",
+     ],
+     hdrs = [
+         "kernels_generic.h",
+     ],
+     visibility = [
+         "//sparse_matmul:__subpackages__",
+     ],
+     deps = [
+         "//sparse_matmul/numerics:fast_transcendentals",
+         "//sparse_matmul/numerics:types",
+     ],
+ )
+
+ cc_library(
+     name = "matmul",
+     srcs = [
+         "matmul_fixed_avx2.cc",
+         "matmul_fixed_avx2.h",
+         "matmul_generic.cc",
+         "matmul_generic.h",
+     ],
+     hdrs = [
+         "matmul.h",
+     ],
+     visibility = [
+         "//sparse_matmul:__subpackages__",
+     ],
+     deps = [
+         "//sparse_matmul/numerics:types",
+         "@com_google_absl//absl/time",
+     ],
+ )
+
+ cc_library(
+     name = "thread_bounds",
+     srcs = ["thread_bounds.cc"],
+     hdrs = ["thread_bounds.h"],
+     visibility = [
+         "//sparse_matmul:__subpackages__",
+     ],
+     deps = [
+         "@com_google_glog//:glog",
+     ],
+ )
+
+ cc_test(
+     name = "gru_gates_test",
+     size = "small",
+     srcs = [
+         "gru_gates_test.cc",
+     ],
+     deps = [
+         ":gru_gates",
+         "@com_google_absl//absl/memory",
+         "@com_google_absl//absl/types:span",
+         "@com_google_googletest//:gtest_main",
+     ],
+ )
sparse_matmul/compute/ar_inputs.h ADDED
@@ -0,0 +1,37 @@
+ /*
+  * Copyright 2021 Google LLC
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_
+
+ namespace csrblocksparse {
+
+ // Possible numbers of Autoregressive inputs.
+ // TODO(b/188702959): Generalize to any non-negative integer value?
+ enum class ARInputsMode {
+   // There are no autoregressive inputs. Inputs to the GRU gates are strictly
+   // from the gate-recurrent matmul and other unrelated inputs.
+   k0ARInputs,
+   // Two autoregressive inputs, such as coarse and fine for WaveRNN.
+   k2ARInputs,
+   // Three autoregressive inputs, such as prev coarse and fine plus current
+   // coarse for WaveRNN.
+   k3ARInputs,
+ };
+
+ }  // namespace csrblocksparse
+
+ #endif  // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_AR_INPUTS_H_
sparse_matmul/compute/gru_gates.h ADDED
@@ -0,0 +1,214 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
19
+
20
+ #include <cstdint>
21
+ #include <vector>
22
+
23
+ // IWYU pragma: begin_exports
24
+ #include "sparse_matmul/compute/ar_inputs.h"
25
+ #include "sparse_matmul/compute/gru_gates_arm.h"
26
+ #include "sparse_matmul/compute/gru_gates_avx_fixed.h"
27
+ #include "sparse_matmul/compute/gru_gates_generic.h"
28
+ #include "sparse_matmul/compute/matmul.h"
29
+ #include "sparse_matmul/numerics/fixed_types.h"
30
+ #include "sparse_matmul/numerics/type_utils.h"
31
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
32
+ // IWYU pragma: end_exports
33
+
34
+ namespace csrblocksparse {
35
+
36
+ // The master template is really a catch-all for the unimplemented cases to
37
+ // run the generics.
38
+ template <typename GRUStateType, typename InputType, typename SampleType = void>
39
+ class GruGates : public MatmulBase {
40
+ public:
41
+ using SampleWeightType = float;
42
+ static constexpr int kSIMDWidth = kGenericSIMDWidth;
43
+
44
+ // Generic GRU function covers all uses for WaveRNN-like architectures and
45
+ // conditioning.
46
+ // Controlled by template parameters thus:
47
+ // - |kInputsMode| == |k0ARInputs|: There are no autoregressive inputs so
48
+ // |ar_sample0|, |ar_sample1|, |ar_sample2|, |ar_01_weights|,
49
+ // |ar_2_weights| are ignored.
50
+ // - |kInputsMode| == |k2ARInputs|: |ar_sample0|, |ar_sample1| are multiplied
51
+ // by |ar_01_weights| and added to the (conditioning) input.
52
+ // - |kInputsMode| == |k3ARInputs|: |ar_sample2| is multiplied by
53
+ // |ar_2_weights| and added to the other two |ar_inputs| (and added to the
54
+ // conditioning input).
55
+ // - If |kSplitGates| is true: The |*gru_recurrent_other_ptr| is secondary
56
+ // recurrent input that must be added to |*gru_recurrent_ptr|.
57
+ // - |num_replicas| determines the number of duplicates of the output to be
58
+ // written, separated by |replica_stride|.
59
+ // - |start|, |end| are |rows| in [0, |state_size|] to be processed by this
60
+ // thread.
61
+ //
62
+ // Previous state is read from |*gru_state_ptr| and the new state is written
63
+ // to *(|gru_state_ptr| + i * |replica_stride| for i in [0, |num_replicas|)).
64
+ template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
65
+ bool kSplitGates = false>
66
+ void GruWithARInput(int start, int end, int state_size,
67
+ const InputType* gru_recurrent_ptr,
68
+ const InputType* input_ptr, GRUStateType* gru_state_ptr,
69
+ const SampleType* ar_sample0 = nullptr,
70
+ const SampleType* ar_sample1 = nullptr,
71
+ const SampleWeightType* ar_01_weights = nullptr,
72
+ int num_replicas = 1, int replica_stride = 0,
73
+ const SampleType* ar_sample2 = nullptr,
74
+ const SampleWeightType* ar_2_weights = nullptr,
75
+ const InputType* gru_recurrent_other_ptr = nullptr) {
76
+ CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
77
+ GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
78
+ kInputsMode, kSplitGates>(
79
+ start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
80
+ input_ptr, gru_state_ptr, ar_2_weights, state_size, ar_sample0,
81
+ ar_sample1, ar_sample2);
82
+ }
83
+
84
+ // No AR inputs, no split gates, no batching, no replicated outputs.
85
+ // TODO(b/188702959): Redirect conditioning GRU here, removing code from
86
+ // gru_layer.h.
87
+ // Copy to specializations.
88
+ void PlainGru(int start, int end, int state_size,
89
+ const InputType* gru_recurrent_ptr, const InputType* input_ptr,
90
+ GRUStateType* gru_state_ptr) {
91
+ GruWithARInput<ARInputsMode::k0ARInputs>(
92
+ start, end, state_size, gru_recurrent_ptr, input_ptr, gru_state_ptr);
93
+ }
94
+ };
95
+
96
+ #if defined __ARM_NEON || defined __aarch64__
97
+ // Partial specialization for float.
98
+ template <>
99
+ class GruGates<float, float, float> : public MatmulBase {
100
+ public:
101
+ static constexpr int kSIMDWidth = kNeonSIMDWidth;
102
+
103
+ // Generic GRU function covers all uses for WaveRNN-like architectures and
104
+ // conditioning.
105
+ template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
106
+ bool kSplitGates = false>
107
+ void GruWithARInput(int start, int end, int state_size,
108
+ const float* gru_recurrent_data, const float* input_data,
109
+ float* gru_state_data, const float* ar_sample0 = nullptr,
110
+ const float* ar_sample1 = nullptr,
111
+ const float* ar_01_weights = nullptr,
112
+ int num_replicas = 1, int replica_stride = 0,
113
+ const float* ar_sample2 = nullptr,
114
+ const float* ar_2_weights = nullptr,
115
+ const float* gru_recurrent_other_data = nullptr) {
116
+ DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
117
+ GoThroughGatesFloat<kInputsMode, kSplitGates>(
118
+ start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
119
+ input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
120
+ ar_sample1, ar_sample2);
121
+ }
122
+ };
123
+ #endif // defined __ARM_NEON || defined __aarch64__
124
+
125
+ // Partial specialization for fixed types. The sample weights are always float
126
+ // whatever the fixed type of the other weights.
127
+ template <int kGRUStateBits, int kInputBits, int kSampleBits>
128
+ class GruGates<fixed16<kGRUStateBits>, fixed32<kInputBits>,
129
+ fixed16<kSampleBits>> : public MatmulBase {
130
+ public:
131
+ #if defined __ARM_NEON || defined __aarch64__
132
+ static constexpr int kSIMDWidth = kNeonSIMDWidth;
133
+ #elif defined __AVX2__
134
+ static constexpr int kSIMDWidth = kAVX2SIMDWidth * 2;
135
+ #else // Generic case.
136
+ static constexpr int kSIMDWidth = kGenericSIMDWidth;
137
+ #endif // __ARM_NEON || defined __aarch64__ / __AVX2__
138
+
139
+ using GRUStateType = fixed16<kGRUStateBits>;
140
+ using InputType = fixed32<kInputBits>;
141
+ using SampleType = fixed16<kSampleBits>;
142
+ using SampleWeightType = float;
143
+ static constexpr int kInputMantissaBits = InputType::kMantissaBits;
144
+ static constexpr int kSampleMantissaBits = SampleType::kMantissaBits;
145
+ static constexpr int kStateMantissaBits = GRUStateType::kMantissaBits;
146
+ // Generic GRU function covers all uses for WaveRNN-like architectures and
147
+ // conditioning.
148
+ template <ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
149
+ bool kSplitGates = false>
150
+ void GruWithARInput(int start, int end, int state_size,
151
+ const InputType* gru_recurrent_data,
152
+ const InputType* input_data, GRUStateType* gru_state_data,
153
+ const SampleType* ar_sample0 = nullptr,
154
+ const SampleType* ar_sample1 = nullptr,
155
+ const SampleWeightType* ar_01_weights = nullptr,
156
+ int num_replicas = 1, int replica_stride = 0,
157
+ const SampleType* ar_sample2 = nullptr,
158
+ const SampleWeightType* ar_2_weights = nullptr,
159
+ const InputType* gru_recurrent_other_data = nullptr) {
160
+ #if defined __ARM_NEON || defined __aarch64__ || defined __AVX2__
161
+ const int32_t* gru_recurrent_ptr =
162
+ reinterpret_cast<const int32_t*>(gru_recurrent_data);
163
+ const int32_t* gru_recurrent_other_ptr =
164
+ reinterpret_cast<const int32_t*>(gru_recurrent_other_data);
165
+ const int32_t* input_ptr = reinterpret_cast<const int32_t*>(input_data);
166
+ int16_t* gru_state_ptr = reinterpret_cast<int16_t*>(gru_state_data);
167
+ #if defined __AVX2__
168
+ // The samples are fixed16, but we scale them up here and convert to float
169
+ // so that the product with the QR weights is always on the same scale as
170
+ // InputType, so we don't have to do any more scaling inside.
171
+ const float sample_factor = static_cast<float>(1 << kInputMantissaBits);
172
+ #else
173
+ const float sample_factor = 1.0f;
174
+ #endif
175
+ // AR sample 0 and 1 are packed into a pair because the QR weights are
176
+ // formatted with the weights interleaved for sample 0 and 1.
177
+ std::pair<float, float> ar_sample01;
178
+ float ar_sample2_float = 0.0f;
179
+ if (kInputsMode == ARInputsMode::k2ARInputs ||
180
+ kInputsMode == ARInputsMode::k3ARInputs) {
181
+ ar_sample01 = {static_cast<float>(*ar_sample0) * sample_factor,
182
+ static_cast<float>(*ar_sample1) * sample_factor};
183
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
184
+ ar_sample2_float = static_cast<float>(*ar_sample2) * sample_factor;
185
+ }
186
+ }
187
+ #if defined __AVX2__
188
+ CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
189
+ GruGatesAVXFixed<kInputMantissaBits, kStateMantissaBits, kInputsMode,
190
+ kSplitGates>(
191
+ start, end, state_size, gru_recurrent_ptr, input_ptr, &ar_sample01,
192
+ ar_01_weights, num_replicas, replica_stride, &ar_sample2_float,
193
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
194
+ #else // ARM.
195
+ DCHECK_EQ(num_replicas, 1) << "ARM code should always have 1 replica";
196
+ GoThroughGatesFixed<GRUStateType, InputType, kInputsMode, kSplitGates>(
197
+ start, end, ar_01_weights, gru_recurrent_ptr, gru_recurrent_other_ptr,
198
+ input_ptr, gru_state_ptr, ar_2_weights, state_size, &ar_sample01,
199
+ &ar_sample2_float);
200
+ #endif // __AVX2__ / ARM.
201
+ #else // Generic case.
202
+ CHECK_EQ(num_replicas, 1) << "Generic code should always have 1 replica";
203
+ GoThroughGates<GRUStateType, InputType, SampleWeightType, SampleType,
204
+ kInputsMode, kSplitGates>(
205
+ start, end, ar_01_weights, gru_recurrent_data, gru_recurrent_other_data,
206
+ input_data, gru_state_data, ar_2_weights, state_size, ar_sample0,
207
+ ar_sample1, ar_sample2);
208
+ #endif // __ARM_NEON || defined __aarch64__ / __AVX2__
209
+ }
210
+ };
211
+
212
+ } // namespace csrblocksparse
213
+
214
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_H_
sparse_matmul/compute/gru_gates_arm.h ADDED
@@ -0,0 +1,288 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_
19
+
20
+ #if defined __ARM_NEON || defined __aarch64__
21
+ #include <arm_neon.h>
22
+ #endif
23
+ #include <cstdint>
24
+
25
+ #include "sparse_matmul/compute/ar_inputs.h"
26
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
27
+
28
+ namespace csrblocksparse {
29
+
30
+ static constexpr int kNeonSIMDWidth = 4;
31
+
32
+ // ------ Scalar calculation --------
33
+ // See "Efficient Neural Audio Synthesis" for a description of the calculation.
34
+ // https://arxiv.org/abs/1802.08435
35
+ //
36
+ // NOTE:
37
+ // |sample| = (|coarse_at_sminus1|, |fine_at_sminus1|,
38
+ // |coarse_at_sminus1|, |fine_at_sminus1|)
39
+ // |w_sample| = (|coarse_at_s|, |coarse_at_s|, |coarse_at_s|, |coarse_at_s|)
40
+ //
41
+ // CHEATSHEET:
42
+ // vld1q_f32 = load 4 32-bit floats
43
+ // vmulq_f32(a, b) : return a * b;
44
+ // vaddq_f32(a, b) : return a + b;
45
+ // vmlaq_f32(c, a, b) : return c + a * b;
46
+ // vpaddq_f32(a, b) : return (a0 + a1, a2 + a3, b0 + b1, b2 + b3)
47
+ // vsubq_f32(a, b) : return a - b;
48
+ // vst1q_f32 = store 4 32-bit floats
49
+ #if defined __ARM_NEON || defined __aarch64__
50
+
51
+ #if !defined __aarch64__
52
+ // Backport of vpaddq_f32 to ARM32.
53
+ inline float32x4_t vpaddq_f32(float32x4_t a, float32x4_t b) {
54
+ float32x2_t a10 = vget_low_f32(a);
55
+ float32x2_t a32 = vget_high_f32(a);
56
+ float32x2_t b10 = vget_low_f32(b);
57
+ float32x2_t b32 = vget_high_f32(b);
58
+ return vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32));
59
+ }
60
+ #endif
61
+
62
+ template <ARInputsMode kInputsMode, bool SplitGates>
63
+ void GoThroughGatesFloat(int start, int end, const float* qr_ptr,
64
+ const float* gru_gates_ptr,
65
+ const float* gru_gates_other_ptr,
66
+ const float* conditioning_ptr, float* gru_h_ptr,
67
+ const float* w_hat, int proj_size,
68
+ const float* coarse_at_sminus1,
69
+ const float* fine_at_sminus1,
70
+ const float* coarse_at_s) {
71
+ // Increment all the pointers to save on pointer arithmetic in the loop.
72
+ conditioning_ptr += start;
73
+ gru_h_ptr += start;
74
+ gru_gates_ptr += start;
75
+ if (SplitGates) {
76
+ DCHECK_NE(gru_gates_other_ptr, nullptr);
77
+ gru_gates_other_ptr += start;
78
+ }
79
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
80
+ DCHECK_NE(qr_ptr, nullptr);
81
+ qr_ptr += 2 * start;
82
+ DCHECK_NE(coarse_at_sminus1, nullptr);
83
+ DCHECK_NE(fine_at_sminus1, nullptr);
84
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
85
+ DCHECK_NE(w_hat, nullptr);
86
+ DCHECK_NE(coarse_at_s, nullptr);
87
+ w_hat += start;
88
+ }
89
+ }
90
+ for (int i = start; i < end; i += kNeonSIMDWidth) {
91
+ float32x4_t reset = vld1q_f32(gru_gates_ptr);
92
+ float32x4_t update = vld1q_f32(gru_gates_ptr + proj_size);
93
+ float32x4_t cell = vld1q_f32(gru_gates_ptr + 2 * proj_size);
94
+ float32x4_t qr_cell;
95
+ if (SplitGates) {
96
+ reset = vaddq_f32(reset, vld1q_f32(gru_gates_other_ptr));
97
+ update = vaddq_f32(update, vld1q_f32(gru_gates_other_ptr + proj_size));
98
+ cell = vaddq_f32(cell, vld1q_f32(gru_gates_other_ptr + 2 * proj_size));
99
+ }
100
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
101
+ // Setup the sample vector.
102
+ float32x4_t sample = vdupq_n_f32(*coarse_at_sminus1);
103
+ sample = vsetq_lane_f32(*fine_at_sminus1, sample, 1);
104
+ sample = vsetq_lane_f32(*fine_at_sminus1, sample, 3);
105
+
106
+ // All auto types are float32x4_t, auto used to fit statements on one line
107
+ // for readability. Do two rows of QR at once.
108
+ auto qr_reset_0 = vmulq_f32(vld1q_f32(qr_ptr), sample);
109
+ auto qr_reset_1 = vmulq_f32(vld1q_f32(qr_ptr + 4), sample);
110
+ auto qr_reset = vpaddq_f32(qr_reset_0, qr_reset_1);
111
+
112
+ auto qr_update_0 = vmulq_f32(vld1q_f32(qr_ptr + 2 * proj_size), sample);
113
+ auto qr_update_1 =
114
+ vmulq_f32(vld1q_f32(qr_ptr + 4 + 2 * proj_size), sample);
115
+ auto qr_update = vpaddq_f32(qr_update_0, qr_update_1);
116
+
117
+ auto qr_cell_0 = vmulq_f32(vld1q_f32(qr_ptr + 4 * proj_size), sample);
118
+ auto qr_cell_1 = vmulq_f32(vld1q_f32(qr_ptr + 4 + 4 * proj_size), sample);
119
+ qr_cell = vpaddq_f32(qr_cell_0, qr_cell_1);
120
+
121
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
122
+ float32x4_t w_sample = vdupq_n_f32(*coarse_at_s);
123
+ qr_reset = vmlaq_f32(qr_reset, vld1q_f32(w_hat), w_sample);
124
+ qr_update =
125
+ vmlaq_f32(qr_update, vld1q_f32(w_hat + proj_size), w_sample);
126
+ qr_cell =
127
+ vmlaq_f32(qr_cell, vld1q_f32(w_hat + 2 * proj_size), w_sample);
128
+ }
129
+ reset = vaddq_f32(reset, qr_reset);
130
+ update = vaddq_f32(update, qr_update);
131
+ }
132
+ auto reset_conditioning = vld1q_f32(conditioning_ptr);
133
+ auto update_conditioning = vld1q_f32(conditioning_ptr + proj_size);
134
+ auto cell_conditioning = vld1q_f32(conditioning_ptr + 2 * proj_size);
135
+
136
+ reset = fast_sigmoid(vaddq_f32(reset, reset_conditioning));
137
+ update = fast_sigmoid(vaddq_f32(update, update_conditioning));
138
+ if (kInputsMode == ARInputsMode::k0ARInputs) {
139
+ cell = vmulq_f32(reset, cell);
140
+ } else {
141
+ cell = vmlaq_f32(qr_cell, reset, cell);
142
+ }
143
+ auto hbar = fast_tanh(vaddq_f32(cell, cell_conditioning));
144
+
145
+ auto prev_h = vld1q_f32(gru_h_ptr);
146
+ auto diff = vsubq_f32(prev_h, hbar);
147
+ auto new_h = vmlaq_f32(hbar, diff, update);
148
+
149
+ vst1q_f32(gru_h_ptr, new_h);
150
+ // Increment all the pointers.
151
+ conditioning_ptr += kNeonSIMDWidth;
152
+ gru_h_ptr += kNeonSIMDWidth;
153
+ gru_gates_ptr += kNeonSIMDWidth;
154
+ if (SplitGates) gru_gates_other_ptr += kNeonSIMDWidth;
155
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
156
+ qr_ptr += 2 * kNeonSIMDWidth;
157
+ if (kInputsMode == ARInputsMode::k3ARInputs) w_hat += kNeonSIMDWidth;
158
+ }
159
+ }
160
+ }
161
+
162
+ // This version should only be used if all of the 32-bit fixed point
163
+ // representations have the same number of mantissa bits.
164
+ // |ar_at_sminus1| packs sample 0 and 1 into a pair because the QR weights are
165
+ // formatted with the weights interleaved for sample 0 and 1. The two samples
166
+ // represent coarse and fine for WaveRNN.
167
+ template <typename GRUStateType, typename GRUMatMulOutType,
168
+ ARInputsMode kInputsMode, bool SplitGates>
169
+ void GoThroughGatesFixed(int start, int end, const float* qr_ptr,
170
+ const int32_t* gru_gates_ptr,
171
+ const int32_t* gru_gates_other_ptr,
172
+ const int32_t* conditioning_ptr, int16_t* gru_h_ptr,
173
+ const float* w_hat, int proj_size,
174
+ const std::pair<float, float>* ar_at_sminus1,
175
+ const float* coarse_at_s) {
176
+ // Increment all the pointers to save on pointer arithmetic in the loop.
177
+ conditioning_ptr += start;
178
+ gru_h_ptr += start;
179
+ gru_gates_ptr += start;
180
+ if (SplitGates) {
181
+ DCHECK_NE(gru_gates_other_ptr, nullptr);
182
+ gru_gates_other_ptr += start;
183
+ }
184
+ float32x4_t sample01;
185
+ float32x4_t w_sample;
186
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
187
+ DCHECK_NE(qr_ptr, nullptr);
188
+ qr_ptr += 2 * start;
189
+ DCHECK_NE(ar_at_sminus1, nullptr);
190
+ sample01 = vdupq_n_f32(ar_at_sminus1->first);
191
+ sample01 = vsetq_lane_f32(ar_at_sminus1->second, sample01, 1);
192
+ sample01 = vsetq_lane_f32(ar_at_sminus1->second, sample01, 3);
193
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
194
+ DCHECK_NE(w_hat, nullptr);
195
+ DCHECK_NE(coarse_at_s, nullptr);
196
+ w_hat += start;
197
+ w_sample = vdupq_n_f32(*coarse_at_s);
198
+ }
199
+ }
200
+ for (int i = start; i < end; i += kNeonSIMDWidth) {
201
+ auto reset = vld1q_s32(gru_gates_ptr);
202
+ auto update = vld1q_s32(gru_gates_ptr + proj_size);
203
+ // vcvtq_n_f32_s32 = convert 32-bit fixed point to fp32
204
+ auto cell_int = vld1q_s32(gru_gates_ptr + 2 * proj_size);
205
+ if (SplitGates) {
206
+ reset = vaddq_s32(reset, vld1q_s32(gru_gates_other_ptr));
207
+ update = vaddq_s32(update, vld1q_s32(gru_gates_other_ptr + proj_size));
208
+ cell_int =
209
+ vaddq_s32(cell_int, vld1q_s32(gru_gates_other_ptr + 2 * proj_size));
210
+ }
211
+ float32x4_t cell =
212
+ vcvtq_n_f32_s32(cell_int, GRUMatMulOutType::kMantissaBits);
213
+ float32x4_t qr_cell;
214
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
215
+ // Do two rows of QR at once.
216
+ float32x4_t qr_reset_0 = vmulq_f32(vld1q_f32(qr_ptr), sample01);
217
+ float32x4_t qr_reset_1 = vmulq_f32(vld1q_f32(qr_ptr + 4), sample01);
218
+ float32x4_t qr_reset = vpaddq_f32(qr_reset_0, qr_reset_1);
219
+
220
+ float32x4_t qr_update_0 =
221
+ vmulq_f32(vld1q_f32(qr_ptr + 2 * proj_size), sample01);
222
+ float32x4_t qr_update_1 =
223
+ vmulq_f32(vld1q_f32(qr_ptr + 4 + 2 * proj_size), sample01);
224
+ float32x4_t qr_update = vpaddq_f32(qr_update_0, qr_update_1);
225
+
226
+ float32x4_t qr_cell_0 =
227
+ vmulq_f32(vld1q_f32(qr_ptr + 4 * proj_size), sample01);
228
+ float32x4_t qr_cell_1 =
229
+ vmulq_f32(vld1q_f32(qr_ptr + 4 + 4 * proj_size), sample01);
230
+ qr_cell = vpaddq_f32(qr_cell_0, qr_cell_1);
231
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
232
+ float32x4_t w_sample = vdupq_n_f32(*coarse_at_s);
233
+ qr_reset = vmlaq_f32(qr_reset, vld1q_f32(w_hat), w_sample);
234
+ qr_update =
235
+ vmlaq_f32(qr_update, vld1q_f32(w_hat + proj_size), w_sample);
236
+ qr_cell =
237
+ vmlaq_f32(qr_cell, vld1q_f32(w_hat + 2 * proj_size), w_sample);
238
+ }
239
+ reset = vaddq_s32(
240
+ reset, vcvtq_n_s32_f32(qr_reset, GRUMatMulOutType::kMantissaBits));
241
+ update = vaddq_s32(
242
+ update, vcvtq_n_s32_f32(qr_update, GRUMatMulOutType::kMantissaBits));
243
+ }
244
+
245
+ auto reset_conditioning = vld1q_s32(conditioning_ptr);
246
+ auto update_conditioning = vld1q_s32(conditioning_ptr + proj_size);
247
+ float32x4_t cell_conditioning =
248
+ vcvtq_n_f32_s32(vld1q_s32(conditioning_ptr + 2 * proj_size),
249
+ GRUMatMulOutType::kMantissaBits);
250
+
251
+ float32x4_t reset_f32 = fast_sigmoid<GRUMatMulOutType::kExponentBits>(
252
+ vaddq_s32(reset, reset_conditioning));
253
+ float32x4_t update_f32 = fast_sigmoid<GRUMatMulOutType::kExponentBits>(
254
+ vaddq_s32(update, update_conditioning));
255
+ if (kInputsMode == ARInputsMode::k0ARInputs) {
256
+ cell = vmulq_f32(reset_f32, cell);
257
+ } else {
258
+ cell = vmlaq_f32(qr_cell, reset_f32, cell);
259
+ }
260
+ float32x4_t hbar = fast_tanh(vaddq_f32(cell, cell_conditioning));
261
+
262
+ float32x4_t prev_h = vcvtq_n_f32_s32(vmovl_s16(vld1_s16(gru_h_ptr)),
263
+ GRUStateType::kMantissaBits);
264
+ float32x4_t diff = vsubq_f32(prev_h, hbar);
265
+ float32x4_t new_h = vmlaq_f32(hbar, diff, update_f32);
266
+
267
+ // vcvtq_n_s32_f32 = convert fp32 to signed 32-bit fixed point
268
+ // vqrshrn_n_s32 = saturating, rounding, narrowing right shift - used to
269
+ // convert a 32-bit fixed point value to a 16-bit fixed point value
270
+ vst1_s16(gru_h_ptr,
271
+ vqrshrn_n_s32(
272
+ vcvtq_n_s32_f32(new_h, GRUStateType::kMantissaBits + 16), 16));
273
+ // Increment all the pointers.
274
+ conditioning_ptr += kNeonSIMDWidth;
275
+ gru_h_ptr += kNeonSIMDWidth;
276
+ gru_gates_ptr += kNeonSIMDWidth;
277
+ if (SplitGates) gru_gates_other_ptr += kNeonSIMDWidth;
278
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
279
+ qr_ptr += 2 * kNeonSIMDWidth;
280
+ if (kInputsMode == ARInputsMode::k3ARInputs) w_hat += kNeonSIMDWidth;
281
+ }
282
+ }
283
+ }
284
+ #endif // defined __ARM_NEON || defined __aarch64__
285
+
286
+ } // namespace csrblocksparse
287
+
288
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_ARM_H_
sparse_matmul/compute/gru_gates_avx_fixed.h ADDED
@@ -0,0 +1,348 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
19
+
20
+ #include <cstdint>
21
+ #if defined __AVX2__
22
+ #include <immintrin.h>
23
+ #endif
24
+ #include <vector>
25
+
26
+ #include "sparse_matmul/compute/ar_inputs.h"
27
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
28
+
29
+ namespace csrblocksparse {
30
+
31
+ #if defined __AVX2__
32
+
33
+ constexpr int kAVX2SIMDWidth = 8;
34
+
35
+ // Loads 8x fixed32 from |ptr0| and adds to |input|.
36
+ // If |kTwoInputs|, also loads from |ptr1| and adds that as well.
37
+ // Returns the 2 or 3-way sum.
38
+ template <bool kTwoInputs>
39
+ inline __m256i LoadAndAddFixed32(const int32_t* ptr0, const int32_t* ptr1,
40
+ const __m256i& input) {
41
+ __m256i data0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr0));
42
+ if (kTwoInputs) {
43
+ __m256i data1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr1));
44
+ data0 = _mm256_add_epi32(data0, data1);
45
+ }
46
+ return _mm256_add_epi32(data0, input);
47
+ }
48
+
49
+ // Loads 8x fixed32 from ptr0.
50
+ // If |kTwoInputs|, also loads from |ptr1| and adds.
51
+ // Multiplies the loaded values by the factor and adds to |input|, which also
52
+ // is converted to float.
53
+ // Returns the sum.
54
+ template <bool kTwoInputs>
55
+ inline __m256 LoadMultiplyAddToFloat(const int32_t* ptr0, const int32_t* ptr1,
56
+ const __m256& float_factor,
57
+ const __m256& input) {
58
+ __m256i data0 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr0));
59
+ if (kTwoInputs) {
60
+ __m256i data1 = _mm256_load_si256(reinterpret_cast<const __m256i*>(ptr1));
61
+ data0 = _mm256_add_epi32(data0, data1);
62
+ }
63
+ __m256 float_result = _mm256_cvtepi32_ps(data0);
64
+ float_result = _mm256_mul_ps(float_result, float_factor);
65
+ return _mm256_add_ps(float_result, input);
66
+ }
67
+
68
+ // Loads 16x float in 2x 8x registers from |ptr0_1| and multiplies by
69
+ // |input_pairs|, likewise formatted as 8x floats, alternating between the two
70
+ // AR inputs and sums each pair of results, making 8x float results.
71
+ // If |kThreeInputs|, also loads 8x float from |ptr2| and multiplies by
72
+ // |third_input|, which must be formatted as 8x float. The second product is
73
+ // added to the previous result.
74
+ // Returns the sum added to |accumulator|.
75
+ template <bool kThreeInputs>
76
+ inline __m256 MultiplyAddFloat(const __m256& input_pairs,
77
+ const __m256& third_input, const float* ptr0_1,
78
+ const float* ptr2, const __m256& accumulator) {
79
+ __m256 data_pair0 = _mm256_load_ps(ptr0_1);
80
+ __m256 data_pair1 = _mm256_load_ps(ptr0_1 + 8);
81
+ data_pair0 = _mm256_mul_ps(data_pair0, input_pairs);
82
+ data_pair1 = _mm256_mul_ps(data_pair1, input_pairs);
83
+ data_pair0 = _mm256_hadd_ps(data_pair0, data_pair1);
84
+ // Swap the middle 2 64 bit pairs to correct the hadd result.
85
+ data_pair0 = _mm256_permute4x64_pd((__m256d)data_pair0, 0xd8);
86
+ if (kThreeInputs) {
87
+ // Load 256 bits (8 x float) of data, then multiply-accumulate.
88
+ data_pair1 = _mm256_load_ps(ptr2);
89
+ data_pair1 = _mm256_mul_ps(data_pair1, third_input);
90
+ data_pair0 = _mm256_add_ps(data_pair0, data_pair1);
91
+ }
92
+ // Add conditioning.
93
+ return _mm256_add_ps(data_pair0, accumulator);
94
+ }
95
+
96
+ // Processes the tanh and the final combination, returns the new GRU state.
97
+ template <int kInputMantissaBits, int kStateMantissaBits, bool kSplitGates>
98
+ inline __m256i GRUComputeState(const __m256& cell0, const __m256& cell1,
99
+ const __m256& reset0, const __m256& reset1,
100
+ const __m256& update0, const __m256& update1,
101
+ const int32_t* gate_ptr,
102
+ const int32_t* gate_other_ptr,
103
+ const void* gru_h_ptr) {
104
+ // Multiply the cell gru output and the reset.
105
+ __m256 float_gru0 = LoadMultiplyAddToFloat<kSplitGates>(
106
+ gate_ptr, gate_other_ptr, reset0, cell0);
107
+ __m256 float_gru1 = LoadMultiplyAddToFloat<kSplitGates>(
108
+ gate_ptr + kAVX2SIMDWidth, gate_other_ptr + kAVX2SIMDWidth, reset1,
109
+ cell1);
110
+ // Compute tanh on the result.
111
+ __m256 hbar0, hbar1;
112
+ float_tanh_float<kInputMantissaBits, TM_ORDER4_FLOAT>(float_gru0, float_gru1,
113
+ hbar0, hbar1);
114
+ // Load the 16-bit previous gru state and update.
115
+ __m256i gru = _mm256_load_si256(reinterpret_cast<__m256i const*>(gru_h_ptr));
116
+ __m256 state_factor =
117
+ _mm256_set1_ps(1.0f / (static_cast<float>(1 << kStateMantissaBits)));
118
+ float_gru0 =
119
+ _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(gru)));
120
+ float_gru1 = _mm256_cvtepi32_ps(
121
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(gru, 1)));
122
+ float_gru0 = _mm256_mul_ps(float_gru0, state_factor);
123
+ float_gru1 = _mm256_mul_ps(float_gru1, state_factor);
124
+ float_gru0 = _mm256_sub_ps(float_gru0, hbar0);
125
+ float_gru1 = _mm256_sub_ps(float_gru1, hbar1);
126
+ float_gru0 = _mm256_mul_ps(float_gru0, update0);
127
+ float_gru1 = _mm256_mul_ps(float_gru1, update1);
128
+ state_factor = _mm256_set1_ps(static_cast<float>(1 << kStateMantissaBits));
129
+ float_gru0 = _mm256_add_ps(float_gru0, hbar0);
130
+ float_gru1 = _mm256_add_ps(float_gru1, hbar1);
131
+ float_gru0 = _mm256_mul_ps(float_gru0, state_factor);
132
+ float_gru1 = _mm256_mul_ps(float_gru1, state_factor);
133
+ return PackFloatsToFixed16(float_gru0, float_gru1);
134
+ }
135
+
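Editor's note: in plain float, the per-element update that GRUComputeState implements is the standard GRU combination below (illustrative sketch only; the real code works on 16 elements in fixed point and rescales via the mantissa-bit template parameters).

#include <cmath>

// hbar = tanh(cell + reset * gate); new_h = hbar + (prev_h - hbar) * update.
inline float GruComputeStateScalar(float cell, float reset, float gate,
                                   float update, float prev_h) {
  float hbar = std::tanh(cell + reset * gate);
  return hbar + (prev_h - hbar) * update;
}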
136
+ // According to |kInputsMode|, processes 0, 2 or 3 autoregressive inputs and
137
+ // combines with |input| and |gates*|.
138
+ // With 2 AR inputs, loads 8x pairs of float from |pair_weights| and multiplies
139
+ // by |paired_ar|, likewise formatted as 8x float, but scaled such that the
140
+ // product with pair_weights is on the same scale as |*input| and |*gates0|,
141
+ // and sums each pair result, making 8x float results.
142
+ // If 3 AR inputs, also loads 8x float from |third_weights| and multiplies by
143
+ // |third_ar|, which must be formatted as 8x scaled floats. The second product
144
+ // is added to the previous result.
145
+ // Inputs, 8x fixed32 are loaded from |input|, and added to the total.
146
+ // Finally 8x fixed32 from |gates0| (and |gates1| if |kTwoGates|) are added as
147
+ // well.
148
+ // Returns the total sum as a float, but on the scale of |*input|.
149
+ template <bool kTwoGates, ARInputsMode kInputsMode>
150
+ inline __m256 GruInput32ToFloat(const __m256& paired_ar,
151
+ const __m256& third_ar,
152
+ const float* pair_weights,
153
+ const float* third_weights,
154
+ const int32_t* gates0, const int32_t* gates1,
155
+ const int32_t* input) {
156
+ __m256i data32 = _mm256_load_si256(reinterpret_cast<__m256i const*>(input));
157
+ data32 = LoadAndAddFixed32<kTwoGates>(gates0, gates1, data32);
158
+ __m256 float_data = _mm256_cvtepi32_ps(data32);
159
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
160
+ float_data = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
161
+ paired_ar, third_ar, pair_weights, third_weights, float_data);
162
+ }
163
+ return float_data;
164
+ }
165
+
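Editor's note: a one-lane scalar sketch of GruInput32ToFloat (illustrative; ar_term stands for the pairwise AR contribution that MultiplyAddFloat computes):

#include <cstdint>

inline float GruInput32ToFloatScalar(bool two_gates, bool has_ar_inputs,
                                     int32_t input, int32_t gate0,
                                     int32_t gate1, float ar_term) {
  int32_t total = input + gate0;
  if (two_gates) total += gate1;         // kTwoGates adds the second gate term.
  float result = static_cast<float>(total);
  if (has_ar_inputs) result += ar_term;  // AR contribution on the same scale.
  return result;
}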
166
+ // Generic GRU gates function controlled by template parameters thus:
167
+ // - |kInputBits|: the mantissa bits in |*input_ptr|, |*gru_recurrent_ptr|.
168
+ // - |kStateBits|: the mantissa_bits in |*gru_state_ptr|.
169
+ // - |kInputsMode| == |k0ARInputs|: There are no autoregressive inputs so
170
+ // |ar_sample0|, |ar_sample1|, |ar_sample2|, |ar_01_weights|, |ar_2_weights| are
171
+ // ignored.
172
+ // - |kInputsMode| == |k2ARInputs|: |ar_sample0|, |ar_sample1| are multiplied by
173
+ // |ar_01_weights| and added to the (conditioning) input.
174
+ // - |kInputsMode| == |k3ARInputs|: |ar_sample2| is multiplied by |ar_2_weights|
175
+ // and added to the other two AR inputs (and added to the conditioning input).
176
+ // - |kReplicas| determines the number of duplicates of the output to be
177
+ // written, separated by |replica_stride|. If zero, then the number of
178
+ // replicas is variable and taken from the |replicas| argument.
179
+ // - If |kSplitGates| is true: The |*gru_recurrent_other_ptr| is secondary
180
+ // recurrent input that must be added to |*gru_recurrent_ptr|.
181
+ // - |start|, |end| are |rows| in [0, |state_size|] to be processed by this
182
+ // thread.
183
+ //
184
+ // Previous state is read from |*gru_state_ptr| and the new state is written to
185
+ // *(|gru_state_ptr| + i * |replica_stride| for i in [0, |kReplicas|]).
186
+ template <int kInputBits, int kStateBits,
187
+ ARInputsMode kInputsMode = ARInputsMode::k0ARInputs,
188
+ int kReplicas = 1, bool kSplitGates = false>
189
+ inline void GruGatesTemplate(
190
+ int start, int end, int state_size, int replicas, int replica_stride,
191
+ const int32_t* gru_recurrent_ptr, const int32_t* input_ptr,
192
+ const std::pair<float, float>* ar_sample01, const float* ar_01_weights,
193
+ const float* ar_sample2, const float* ar_2_weights,
194
+ const int32_t* gru_recurrent_other_ptr, int16_t* gru_state_ptr) {
195
+ constexpr int kQRIncrement = kAVX2SIMDWidth;
196
+ // Increment all the pointers to save on pointer arithmetic in the loop.
197
+ input_ptr += start;
198
+ gru_state_ptr += start;
199
+ gru_recurrent_ptr += start;
200
+ if (kSplitGates) gru_recurrent_other_ptr += start;
201
+ __m256 ar_2_inputs, ar_3rd_input;
202
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
203
+ ar_01_weights += 2 * start;
204
+ ar_2_inputs = _mm256_castsi256_ps(
205
+ _mm256_set1_epi64x(*reinterpret_cast<const int64_t*>(ar_sample01)));
206
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
207
+ ar_2_weights += start;
208
+ ar_3rd_input = _mm256_set1_ps(*ar_sample2);
209
+ } else {
210
+ ar_3rd_input = {};
211
+ }
212
+ } else {
213
+ ar_2_inputs = {};
214
+ ar_3rd_input = {};
215
+ }
216
+ // The transcendentals handle 2x registers of data at once, so we have to do
217
+ // everything in duplicate.
218
+ for (int i = start; i < end; i += kQRIncrement * 2) {
219
+ // Load 8 pairs of fixed16s for each of reset, update and cell.
220
+ __m256 reset0 = GruInput32ToFloat<kSplitGates, kInputsMode>(
221
+ ar_2_inputs, ar_3rd_input, ar_01_weights, ar_2_weights,
222
+ gru_recurrent_ptr, gru_recurrent_other_ptr, input_ptr);
223
+ __m256 reset1 = GruInput32ToFloat<kSplitGates, kInputsMode>(
224
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 2 * kQRIncrement,
225
+ ar_2_weights + kQRIncrement, gru_recurrent_ptr + kAVX2SIMDWidth,
226
+ gru_recurrent_other_ptr + kAVX2SIMDWidth, input_ptr + kAVX2SIMDWidth);
227
+ float_sigmoid_float<kInputBits>(reset0, reset1);
228
+ __m256 update0 = GruInput32ToFloat<kSplitGates, kInputsMode>(
229
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 2 * state_size,
230
+ ar_2_weights + state_size, gru_recurrent_ptr + state_size,
231
+ gru_recurrent_other_ptr + state_size, input_ptr + state_size);
232
+ __m256 update1 = GruInput32ToFloat<kSplitGates, kInputsMode>(
233
+ ar_2_inputs, ar_3rd_input,
234
+ ar_01_weights + 2 * state_size + 2 * kQRIncrement,
235
+ ar_2_weights + state_size + kQRIncrement,
236
+ gru_recurrent_ptr + state_size + kAVX2SIMDWidth,
237
+ gru_recurrent_other_ptr + state_size + kAVX2SIMDWidth,
238
+ input_ptr + state_size + kAVX2SIMDWidth);
239
+ float_sigmoid_float<kInputBits>(update0, update1);
240
+ __m256 cell0 = _mm256_cvtepi32_ps(_mm256_load_si256(
241
+ reinterpret_cast<__m256i const*>(input_ptr + 2 * state_size)));
242
+ __m256 cell1 =
243
+ _mm256_cvtepi32_ps(_mm256_load_si256(reinterpret_cast<__m256i const*>(
244
+ input_ptr + 2 * state_size + kAVX2SIMDWidth)));
245
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
246
+ cell0 = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
247
+ ar_2_inputs, ar_3rd_input, ar_01_weights + 4 * state_size,
248
+ ar_2_weights + 2 * state_size, cell0);
249
+ cell1 = MultiplyAddFloat<kInputsMode == ARInputsMode::k3ARInputs>(
250
+ ar_2_inputs, ar_3rd_input,
251
+ ar_01_weights + 4 * state_size + 2 * kQRIncrement,
252
+ ar_2_weights + 2 * state_size + kQRIncrement, cell1);
253
+ }
254
+ __m256i gru_state = GRUComputeState<kInputBits, kStateBits, kSplitGates>(
255
+ cell0, cell1, reset0, reset1, update0, update1,
256
+ gru_recurrent_ptr + 2 * state_size,
257
+ gru_recurrent_other_ptr + 2 * state_size, gru_state_ptr);
258
+ if (kReplicas > 0) {
259
+ // With |kReplicas| a template parameter, the compiler will unroll the
260
+ // loop.
261
+ for (int j = 0; j < kReplicas; ++j) {
262
+ _mm256_store_si256(
263
+ reinterpret_cast<__m256i*>(gru_state_ptr + j * replica_stride),
264
+ gru_state);
265
+ }
266
+ } else {
267
+ // This loop will not unroll as replicas is variable.
268
+ for (int j = 0; j < replicas; ++j) {
269
+ _mm256_store_si256(
270
+ reinterpret_cast<__m256i*>(gru_state_ptr + j * replica_stride),
271
+ gru_state);
272
+ }
273
+ }
274
+ // Increment all the pointers.
275
+ input_ptr += 2 * kAVX2SIMDWidth;
276
+ gru_state_ptr += 2 * kAVX2SIMDWidth;
277
+ gru_recurrent_ptr += 2 * kAVX2SIMDWidth;
278
+ if (kSplitGates) gru_recurrent_other_ptr += 2 * kAVX2SIMDWidth;
279
+ if (kInputsMode != ARInputsMode::k0ARInputs) {
280
+ ar_01_weights += 4 * kQRIncrement;
281
+ if (kInputsMode == ARInputsMode::k3ARInputs)
282
+ ar_2_weights += 2 * kQRIncrement;
283
+ }
284
+ }
285
+ }
286
+
287
+ // Dispatches calls to the GruGatesTemplate function above converting the
288
+ // replicas variable argument to a template parameter to allow the compiler to
289
+ // unroll the write loop.
290
+ // |ar_sample01| packs sample 0 and 1 into a pair because the QR weights are
291
+ // formatted with the weights interleaved for sample 0 and 1. The two samples
292
+ // represent coarse and fine for WaveRNN.
293
+ template <int kInputBits, int kStateBits,
294
+ ARInputsMode kInputsMode = ARInputsMode::k2ARInputs,
295
+ bool kSplitGates = false>
296
+ inline void GruGatesAVXFixed(
297
+ int start, int end, int state_size, const int32_t* gru_recurrent_ptr,
298
+ const int32_t* input_ptr, const std::pair<float, float>* ar_sample01,
299
+ const float* ar_01_weights, int num_replicas, int replica_stride,
300
+ const float* ar_sample2, const float* ar_2_weights,
301
+ const int32_t* gru_recurrent_other_ptr, int16_t* gru_state_ptr) {
302
+ // Convert the number of replicas from a variable to a template parameter
303
+ // with a switch. This enables the compiler to unroll the loop for
304
+ // the write, making it faster for common numbers of threads.
305
+ switch (num_replicas) {
306
+ case 1:
307
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/1,
308
+ kSplitGates>(
309
+ start, end, state_size, num_replicas, replica_stride,
310
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
311
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
312
+ break;
313
+ case 2:
314
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/2,
315
+ kSplitGates>(
316
+ start, end, state_size, num_replicas, replica_stride,
317
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
318
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
319
+ break;
320
+ case 4:
321
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/4,
322
+ kSplitGates>(
323
+ start, end, state_size, num_replicas, replica_stride,
324
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
325
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
326
+ break;
327
+ case 6:
328
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/6,
329
+ kSplitGates>(
330
+ start, end, state_size, num_replicas, replica_stride,
331
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
332
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
333
+ break;
334
+ default:
335
+ // Zero |kReplicas| tells the function to use the |num_replicas| variable.
336
+ GruGatesTemplate<kInputBits, kStateBits, kInputsMode, /*kReplicas=*/0,
337
+ kSplitGates>(
338
+ start, end, state_size, num_replicas, replica_stride,
339
+ gru_recurrent_ptr, input_ptr, ar_sample01, ar_01_weights, ar_sample2,
340
+ ar_2_weights, gru_recurrent_other_ptr, gru_state_ptr);
341
+ }
342
+ }
343
+
344
+ #endif // __AVX2__
345
+
346
+ } // namespace csrblocksparse
347
+
348
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_AVX_FIXED_H_
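Editor's note: the switch in GruGatesAVXFixed is an instance of a general pattern, converting a small set of runtime values into a template parameter so the compiler can unroll the replica-write loop. A stripped-down sketch of the idea (hypothetical names, not library API):

template <int kReplicas>
void WriteReplicas(int replicas) {
  // kReplicas > 0: the trip count is a compile-time constant and can unroll.
  // kReplicas == 0: fall back to the runtime |replicas| value.
  const int count = kReplicas > 0 ? kReplicas : replicas;
  for (int j = 0; j < count; ++j) {
    // ... store one replica of the result ...
  }
}

void Dispatch(int num_replicas) {
  switch (num_replicas) {
    case 1: WriteReplicas<1>(num_replicas); break;
    case 2: WriteReplicas<2>(num_replicas); break;
    case 4: WriteReplicas<4>(num_replicas); break;
    default: WriteReplicas<0>(num_replicas); break;  // variable count
  }
}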
sparse_matmul/compute/gru_gates_generic.h ADDED
@@ -0,0 +1,97 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
19
+
20
+ #include "sparse_matmul/compute/ar_inputs.h"
21
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
22
+
23
+ namespace csrblocksparse {
24
+
25
+ constexpr int kGenericSIMDWidth = 4;
26
+
27
+ // TODO(b/188702959): Rename arguments to match gru_gates.h.
28
+ template <typename GRUStateType, typename GRUMatMulOutType, typename QR_W_Type,
29
+ typename SampleType, ARInputsMode kInputsMode,
30
+ bool SplitGates = false>
31
+ void GoThroughGates(int start, int end, const QR_W_Type* qr_ptr,
32
+ const GRUMatMulOutType* gru_gates_ptr,
33
+ const GRUMatMulOutType* gru_gates_other_ptr,
34
+ const GRUMatMulOutType* conditioning_ptr,
35
+ GRUStateType* gru_h_ptr, const QR_W_Type* w_hat,
36
+ int proj_size, const SampleType* coarse_at_sminus1,
37
+ const SampleType* fine_at_sminus1,
38
+ const SampleType* coarse_at_s = nullptr) {
39
+ float qr_cell = 0.0f, reset, update, cell;
40
+ for (int i = start; i < end; ++i) {
41
+ if (kInputsMode == ARInputsMode::k0ARInputs) {
42
+ reset = static_cast<float>(gru_gates_ptr[i]);
43
+ update = static_cast<float>(gru_gates_ptr[proj_size + i]);
44
+ } else {
45
+ float qr_c_reset = static_cast<float>(qr_ptr[2 * i + 0]);
46
+ float qr_f_reset = static_cast<float>(qr_ptr[2 * i + 1]);
47
+ float qr_c_update = static_cast<float>(qr_ptr[2 * proj_size + 2 * i + 0]);
48
+ float qr_f_update = static_cast<float>(qr_ptr[2 * proj_size + 2 * i + 1]);
49
+ float qr_c_cell = static_cast<float>(qr_ptr[4 * proj_size + 2 * i + 0]);
50
+ float qr_f_cell = static_cast<float>(qr_ptr[4 * proj_size + 2 * i + 1]);
51
+ float w_hat_i_reset = 0.0f;
52
+ float w_hat_i_update = 0.0f;
53
+ float w_hat_i_cell = 0.0f;
54
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
55
+ w_hat_i_reset = static_cast<float>(w_hat[i]);
56
+ w_hat_i_update = static_cast<float>(w_hat[proj_size + i]);
57
+ w_hat_i_cell = static_cast<float>(w_hat[2 * proj_size + i]);
58
+ }
59
+ float coarse = static_cast<float>(coarse_at_sminus1[0]);
60
+ float fine = static_cast<float>(fine_at_sminus1[0]);
61
+ reset = qr_c_reset * coarse + qr_f_reset * fine;
62
+ update = qr_c_update * coarse + qr_f_update * fine;
63
+ qr_cell = qr_c_cell * coarse + qr_f_cell * fine;
64
+ if (kInputsMode == ARInputsMode::k3ARInputs) {
65
+ float coarse = static_cast<float>(coarse_at_s[0]);
66
+ reset += w_hat_i_reset * coarse;
67
+ update += w_hat_i_update * coarse;
68
+ qr_cell += w_hat_i_cell * coarse;
69
+ }
70
+ reset += static_cast<float>(gru_gates_ptr[i]);
71
+ update += static_cast<float>(gru_gates_ptr[proj_size + i]);
72
+ }
73
+ cell = static_cast<float>(gru_gates_ptr[2 * proj_size + i]);
74
+ if (SplitGates) {
75
+ reset += static_cast<float>(gru_gates_other_ptr[i]);
76
+ update += static_cast<float>(gru_gates_other_ptr[proj_size + i]);
77
+ cell += static_cast<float>(gru_gates_other_ptr[2 * proj_size + i]);
78
+ }
79
+ float reset_conditioning = static_cast<float>(conditioning_ptr[i]);
80
+ float update_conditioning =
81
+ static_cast<float>(conditioning_ptr[proj_size + i]);
82
+ float cell_conditioning =
83
+ static_cast<float>(conditioning_ptr[2 * proj_size + i]);
84
+ reset = fast_sigmoid(reset + reset_conditioning);
85
+ update = fast_sigmoid(update + update_conditioning);
86
+ float hbar = fast_tanh(qr_cell + reset * cell + cell_conditioning);
87
+ int h_index = i;
88
+ float prev_h = static_cast<float>(gru_h_ptr[h_index]);
89
+ float diff = prev_h - hbar;
90
+ float new_h = hbar + diff * update;
91
+ gru_h_ptr[h_index] = static_cast<GRUStateType>(new_h);
92
+ }
93
+ }
94
+
95
+ } // namespace csrblocksparse
96
+
97
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_GRU_GATES_GENERIC_H_
sparse_matmul/compute/gru_gates_test.cc ADDED
@@ -0,0 +1,164 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/gru_gates.h"
16
+
17
+ #include <cstdint>
18
+ #include <cstring>
19
+ #include <numeric>
20
+
21
+ #include "absl/memory/memory.h"
22
+ #include "absl/types/span.h"
23
+ #include "gmock/gmock.h"
24
+ #include "gtest/gtest.h"
25
+
26
+ namespace {
27
+
28
+ using csrblocksparse::ARInputsMode;
29
+
30
+ template <typename GRUStateType, typename InputType, typename SampleType = void,
31
+ csrblocksparse::ARInputsMode kInputsMode, bool kSplitGates>
32
+ csrblocksparse::CacheAlignedVector<GRUStateType> TestGruGates() {
33
+ using SampleWeightType = float;
34
+ constexpr int kStateSize = 16;
35
+ csrblocksparse::CacheAlignedVector<SampleWeightType> qr(6 * kStateSize);
36
+ csrblocksparse::CacheAlignedVector<SampleWeightType> w(3 * kStateSize);
37
+ csrblocksparse::CacheAlignedVector<InputType> gru_gates(3 * kStateSize);
38
+ csrblocksparse::CacheAlignedVector<InputType> gru_other_gates(3 * kStateSize);
39
+ csrblocksparse::CacheAlignedVector<InputType> conditioning(3 * kStateSize);
40
+ csrblocksparse::CacheAlignedVector<GRUStateType> gru_h(kStateSize);
41
+ csrblocksparse::GruGates<GRUStateType, InputType, SampleType> gru_gates_impl;
42
+ const SampleType kCoarseAtSMinus1(0.03f);
43
+ const SampleType kFineAtSMinus1(0.07f);
44
+ const SampleType kCoarseAtS(-0.02f);
45
+
46
+ qr.FillOnes();
47
+ w.FillOnes();
48
+ gru_gates.FillRandom();
49
+ gru_other_gates.FillRandom();
50
+ conditioning.FillRandom();
51
+ gru_h.FillZero();
52
+
53
+ gru_gates_impl.template GruWithARInput<kInputsMode, kSplitGates>(
54
+ /*start=*/0, /*end=*/kStateSize, kStateSize, gru_gates.data(),
55
+ conditioning.data(), gru_h.data(), &kCoarseAtSMinus1, &kFineAtSMinus1,
56
+ qr.data(),
57
+ /*num_replicas=*/1, /*replica_stride=*/0, &kCoarseAtS, w.data(),
58
+ gru_other_gates.data());
59
+ return gru_h;
60
+ }
61
+
62
+ TEST(GruGates, FloatWaveRNNCoarseMatchesGolden) {
63
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
64
+ // will also need to change.
65
+ const std::vector<float> kGoldenValues = {
66
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.746f, 0.0f, 0.0f,
67
+ 0.0f, 0.0f, 0.970f, 0.0f, 0.0f, 1.0f, 0.0f, -0.993f};
68
+ csrblocksparse::CacheAlignedVector<float> gru_h =
69
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
70
+ /*kSplitGates=*/true>();
71
+
72
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
73
+ for (int i = 0; i < gru_h.size(); ++i) {
74
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
75
+ }
76
+ }
77
+
78
+ TEST(GruGates, FloatWaveRNNFineMatchesGolden) {
79
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
80
+ // will also need to change.
81
+ const std::vector<float> kGoldenValues = {
82
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.737f, 0.0f, 0.0f,
83
+ 0.0f, 0.0f, 0.969f, 0.0f, 0.0f, 1.0f, 0.0f, -0.994f};
84
+ csrblocksparse::CacheAlignedVector<float> gru_h =
85
+ TestGruGates<float, float, float, ARInputsMode::k3ARInputs,
86
+ /*kSplitGates=*/true>();
87
+
88
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
89
+ for (int i = 0; i < gru_h.size(); ++i) {
90
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
91
+ }
92
+ }
93
+
94
+ TEST(GruGates, FloatTwoArInputsNonSplitGateMatchesGolden) {
95
+ // If the RNG in csrblocksparse::CacheAlignedVector changes, these numbers
96
+ // will also need to change.
97
+ const std::vector<float> kGoldenValues = {
98
+ 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.714f, 0.0f, -0.002f,
99
+ 0.0f, 0.0f, 0.970f, 0.0f, 0.0f, 1.0f, 0.0f, -0.965f};
100
+ csrblocksparse::CacheAlignedVector<float> gru_h =
101
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
102
+ /*kSplitGates=*/false>();
103
+
104
+ ASSERT_EQ(kGoldenValues.size(), gru_h.size());
105
+ for (int i = 0; i < gru_h.size(); ++i) {
106
+ EXPECT_NEAR(kGoldenValues[i], gru_h[i], 1e-3) << "i=" << i;
107
+ }
108
+ }
109
+
110
+ TEST(GruGates, FixedWaveRNNCoarseMatchesFloat) {
111
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
112
+ using GRUStateType = csrblocksparse::fixed16<2>;
113
+ using SampleType = csrblocksparse::fixed16<0>;
114
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
115
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
116
+ /*kSplitGates=*/true>();
117
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
118
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
119
+ ARInputsMode::k2ARInputs, /*kSplitGates=*/true>();
120
+
121
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
122
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
123
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
124
+ << "i=" << i;
125
+ }
126
+ }
127
+
128
+ TEST(GruGates, FixedWaveRNNFineMatchesFloat) {
129
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
130
+ using GRUStateType = csrblocksparse::fixed16<2>;
131
+ using SampleType = csrblocksparse::fixed16<0>;
132
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
133
+ TestGruGates<float, float, float, ARInputsMode::k3ARInputs,
134
+ /*kSplitGates=*/true>();
135
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
136
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
137
+ ARInputsMode::k3ARInputs, /*kSplitGates=*/true>();
138
+
139
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
140
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
141
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
142
+ << "i=" << i;
143
+ }
144
+ }
145
+
146
+ TEST(GruGates, FixedTwoArInputsNonSplitGateMatchesFloat) {
147
+ using GRUMatMulOutType = csrblocksparse::fixed32<11>;
148
+ using GRUStateType = csrblocksparse::fixed16<2>;
149
+ using SampleType = csrblocksparse::fixed16<0>;
150
+ csrblocksparse::CacheAlignedVector<float> float_gru_h =
151
+ TestGruGates<float, float, float, ARInputsMode::k2ARInputs,
152
+ /*kSplitGates=*/false>();
153
+ csrblocksparse::CacheAlignedVector<GRUStateType> fixed_gru_h =
154
+ TestGruGates<GRUStateType, GRUMatMulOutType, SampleType,
155
+ ARInputsMode::k2ARInputs, /*kSplitGates=*/false>();
156
+
157
+ ASSERT_EQ(float_gru_h.size(), fixed_gru_h.size());
158
+ for (int i = 0; i < fixed_gru_h.size(); ++i) {
159
+ EXPECT_NEAR(float_gru_h[i], static_cast<float>(fixed_gru_h[i]), 1e-3)
160
+ << "i=" << i;
161
+ }
162
+ }
163
+
164
+ } // namespace
sparse_matmul/compute/kernels_arm.h ADDED
The diff for this file is too large to render. See raw diff
 
sparse_matmul/compute/kernels_avx.h ADDED
@@ -0,0 +1,601 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
19
+
20
+ #if defined __AVX__
21
+ #include <immintrin.h>
22
+
23
+ #include <algorithm>
24
+ #include <type_traits>
25
+ // TODO(b/188702959): Remove fast_transcendentals with GRU refactor.
26
+ #include "sparse_matmul/numerics/fast_transcendentals.h"
27
+ #include "sparse_matmul/numerics/fixed_types.h"
28
+ #include "sparse_matmul/numerics/float16_types.h"
29
+ #include "sparse_matmul/numerics/type_utils.h"
30
+
31
+ namespace csrblocksparse {
32
+ namespace detail {
33
+
34
+ template <typename WeightType, typename RhsType, typename OutType>
35
+ struct IsAllowableFloatTypes
36
+ : std::integral_constant<bool, std::is_same<WeightType, float>::value &&
37
+ std::is_same<RhsType, float>::value &&
38
+ std::is_same<OutType, float>::value> {};
39
+
40
+ #if defined __AVX2__
41
+ // 16-bit inputs, 32-bit output exponent matches sum of input exponents
42
+ // OR
43
+ // 16-bit inputs, 16-bit output - will shift to match exponent
44
+ template <typename WeightType, typename RhsType, typename OutType>
45
+ struct IsAllowableFixedTypes
46
+ : std::integral_constant<bool, (IsFixed16Type<WeightType>::value &&
47
+ IsFixed16Type<RhsType>::value) &&
48
+ (IsFixed32Type<OutType>::value ||
49
+ IsFixed16Type<OutType>::value)> {};
50
+
51
+ template <typename WeightType, typename RhsType, typename OutType>
52
+ struct ShouldEnableGenericKernel
53
+ : std::integral_constant<
54
+ bool,
55
+ !IsAllowableFloatTypes<WeightType, RhsType, OutType>::value &&
56
+ !IsAllowableFixedTypes<WeightType, RhsType, OutType>::value> {};
57
+
58
+ template <typename Type>
59
+ struct IsAddableFixedTypes
60
+ : std::integral_constant<bool, IsFixed32Type<Type>::value ||
61
+ IsFixed16Type<Type>::value> {};
62
+ template <typename Type>
63
+ struct ShouldEnableGenericAdd
64
+ : std::integral_constant<bool, !IsAddableFixedTypes<Type>::value> {};
65
+
66
+ #else // No AVX2.
67
+
68
+ template <typename WeightType, typename RhsType, typename OutType>
69
+ struct ShouldEnableGenericKernel
70
+ : std::integral_constant<
71
+ bool, !IsAllowableFloatTypes<WeightType, RhsType, OutType>::value> {};
72
+
73
+ template <typename Type>
74
+ struct ShouldEnableGenericAdd : std::true_type {};
75
+ #endif // __AVX2__
76
+
77
+ template <typename WeightType, typename RhsType, typename OutType>
78
+ struct ShouldEnableGenericSpMV_4x4
79
+ : ShouldEnableGenericKernel<WeightType, RhsType, OutType> {};
80
+ template <typename WeightType, typename RhsType, typename OutType>
81
+ struct ShouldEnableGenericSpMM5_4x4
82
+ : ShouldEnableGenericKernel<WeightType, RhsType, OutType> {};
83
+ template <typename WeightType, typename RhsType, typename OutType>
84
+ struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
85
+ template <typename WeightType, typename RhsType, typename OutType>
86
+ struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
87
+
88
+ // The computational routines do NO error checking for speed. It is assumed
89
+ // that this has been handled by CSRBlockSparseMatrix.
90
+
91
+ // In-line function to extract results from a pair of registers and store in
92
+ // memory. Note that the non-const references are registers, and are modified
93
+ // by this function!
94
+ inline void Extract4Results(bool relu, __m256& sum1, __m256& sum2,
95
+ float** out_ptr) {
96
+ // Horizontally add the results. We have 2 registers, |sum1| and |sum2| that
97
+ // each contain 2 sets of 4 values that need to be added.
98
+ sum1 = _mm256_hadd_ps(sum1, sum2);
99
+ sum1 = _mm256_hadd_ps(sum1, sum1);
100
+ // Now |sum1| contains [|res0|, |res2|, |res0|, |res2|, |res1|, |res3|,
101
+ // |res1|, |res3|]
102
+ if (relu) {
103
+ sum1 = _mm256_max_ps(sum1, _mm256_setzero_ps());
104
+ }
105
+ // It is really hard in AVX to cross the 128 bit 'lanes' and this is the
106
+ // *only* way to do it.
107
+ // Get the top half of |sum1| in to bottom of |sum2|.
108
+ sum2 = _mm256_permute2f128_ps(sum1, sum1, 1);
109
+ // Interleave the values between the two registers.
110
+ sum1 = _mm256_unpacklo_ps(sum1, sum2);
111
+ // Save the lower 128 bits (4 floats).
112
+ __m128 result = _mm256_extractf128_ps(sum1, 0);
113
+ _mm_store_ps(*out_ptr, result);
114
+ *out_ptr += 4;
115
+ }
116
+
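Editor's note: a scalar reference for Extract4Results (illustrative only). Each of the four outputs is the horizontal sum of one 128-bit half of |sum1| or |sum2|, optionally clamped at zero:

inline void Extract4ResultsScalar(bool relu, const float sum1[8],
                                  const float sum2[8], float** out_ptr) {
  const float* halves[4] = {sum1, sum1 + 4, sum2, sum2 + 4};
  for (int k = 0; k < 4; ++k) {
    float total = halves[k][0] + halves[k][1] + halves[k][2] + halves[k][3];
    if (relu && total < 0.f) total = 0.f;
    (*out_ptr)[k] = total;
  }
  *out_ptr += 4;
}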
117
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
118
+ // blocked pattern, x is a vector and b is vector. Weights are stored for this
119
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
120
+ // row-major format. column indices are converted to deltas and then multiplied
121
+ // by 2 to convert to bytes, so that the value can be used directly to offset
122
+ // the pointer into the rhs vector.
123
+ //
124
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
125
+ // this function. This is automatically taken care of in SparseLinearLayer.
126
+ // The bias is reconstructed through horizontal additions, which leads to a small
127
+ // speedup by reducing latencies at the end of the loop.
128
+ template <typename WeightType, typename RhsType, typename OutType>
129
+ typename std::enable_if<std::is_same<WeightType, float>::value &&
130
+ std::is_same<RhsType, float>::value &&
131
+ std::is_same<OutType, float>::value>::type
132
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
133
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
134
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
135
+ OutType* out_ptr, int64_t assigned_rows,
136
+ int64_t rows /* only used in SpMM variants */,
137
+ int64_t cols /* only used in SpMM variants */, int relu) {
138
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
139
+ // Broadcast the biases by 4 to undo the division by 4 in the input biases.
140
+ __m256 sum1 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
141
+ _mm_broadcast_ss(bias_ptr));
142
+ bias_ptr += 2;
143
+ __m256 sum2 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
144
+ _mm_broadcast_ss(bias_ptr));
145
+ bias_ptr += 2;
146
+
147
+ int reduced_col_count = *nnz_per_row++;
148
+ for (int c = 0; c < reduced_col_count; ++c) {
149
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
150
+ rhs_ptr += col_delta;
151
+ // Multiply this 4x4 block.
152
+ __m256 rhs =
153
+ _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptr));
154
+ __m256 weights1 = _mm256_load_ps(weights_ptr);
155
+ weights_ptr += 8;
156
+ sum1 = _mm256_add_ps(sum1, _mm256_mul_ps(weights1, rhs));
157
+ __m256 weights2 = _mm256_load_ps(weights_ptr);
158
+ weights_ptr += 8;
159
+ sum2 = _mm256_add_ps(sum2, _mm256_mul_ps(weights2, rhs));
160
+ }
161
+ Extract4Results(relu, sum1, sum2, &out_ptr);
162
+ }
163
+ }
164
+
165
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
166
+ // blocked pattern, x is a fat vector with 5 columns and b is vector. b is
167
+ // broadcast. Weights are stored for this routine by making each 4x4 block
168
+ // contiguous. Blocks are ordered in standard row-major format. column indices
169
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
170
+ // that the value can be used directly to offset the pointer into the rhs
171
+ // vector.
172
+ //
173
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
174
+ // this function. This is automatically taken care of in SparseLinearLayer.
175
+ // The bias is reconstructed through horizontal additions, which leads to a small
176
+ // speedup by reducing latencies at the end of the loop.
177
+ template <typename WeightType, typename RhsType, typename OutType>
178
+ typename std::enable_if<std::is_same<WeightType, float>::value &&
179
+ std::is_same<RhsType, float>::value &&
180
+ std::is_same<OutType, float>::value>::type
181
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
182
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
183
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
184
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
185
+ int relu) {
186
+ const RhsType* rhs_ptrs[5];
187
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
188
+
189
+ OutType* out_ptrs[5];
190
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
191
+
192
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
193
+ // We will accumulate the results in 10 registers, |sum1_0| to |sum2_4|.
194
+ // Broadcast the biases by 4 to undo the division by 4 in the input biases.
195
+ __m256 sum1_0 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
196
+ _mm_broadcast_ss(bias_ptr));
197
+ bias_ptr += 2;
198
+ __m256 sum2_0 = _mm256_set_m128(_mm_broadcast_ss(bias_ptr + 1),
199
+ _mm_broadcast_ss(bias_ptr));
200
+ bias_ptr += 2;
201
+ __m256 sum1_1 = sum1_0;
202
+ __m256 sum2_1 = sum2_0;
203
+ __m256 sum1_2 = sum1_0;
204
+ __m256 sum2_2 = sum2_0;
205
+ __m256 sum1_3 = sum1_0;
206
+ __m256 sum2_3 = sum2_0;
207
+ __m256 sum1_4 = sum1_0;
208
+ __m256 sum2_4 = sum2_0;
209
+
210
+ int reduced_col_count = *nnz_per_row++;
211
+ for (int c = 0; c < reduced_col_count; ++c) {
212
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
213
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
214
+
215
+ // Multiply this 4x4 block.
216
+ __m256 rhs =
217
+ _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[0]));
218
+ __m256 weights1 = _mm256_load_ps(weights_ptr);
219
+ weights_ptr += 8;
220
+ sum1_0 = _mm256_add_ps(sum1_0, _mm256_mul_ps(weights1, rhs));
221
+ __m256 weights2 = _mm256_load_ps(weights_ptr);
222
+ weights_ptr += 8;
223
+ sum2_0 = _mm256_add_ps(sum2_0, _mm256_mul_ps(weights2, rhs));
224
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[1]));
225
+ sum1_1 = _mm256_add_ps(sum1_1, _mm256_mul_ps(weights1, rhs));
226
+ sum2_1 = _mm256_add_ps(sum2_1, _mm256_mul_ps(weights2, rhs));
227
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[2]));
228
+ sum1_2 = _mm256_add_ps(sum1_2, _mm256_mul_ps(weights1, rhs));
229
+ sum2_2 = _mm256_add_ps(sum2_2, _mm256_mul_ps(weights2, rhs));
230
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[3]));
231
+ sum1_3 = _mm256_add_ps(sum1_3, _mm256_mul_ps(weights1, rhs));
232
+ sum2_3 = _mm256_add_ps(sum2_3, _mm256_mul_ps(weights2, rhs));
233
+ rhs = _mm256_broadcast_ps(reinterpret_cast<const __m128*>(rhs_ptrs[4]));
234
+ sum1_4 = _mm256_add_ps(sum1_4, _mm256_mul_ps(weights1, rhs));
235
+ sum2_4 = _mm256_add_ps(sum2_4, _mm256_mul_ps(weights2, rhs));
236
+ }
237
+
238
+ Extract4Results(relu, sum1_0, sum2_0, &out_ptrs[0]);
239
+ Extract4Results(relu, sum1_1, sum2_1, &out_ptrs[1]);
240
+ Extract4Results(relu, sum1_2, sum2_2, &out_ptrs[2]);
241
+ Extract4Results(relu, sum1_3, sum2_3, &out_ptrs[3]);
242
+ Extract4Results(relu, sum1_4, sum2_4, &out_ptrs[4]);
243
+ }
244
+ }
245
+
246
+ #ifdef __AVX2__
247
+
248
+ // In-line function to finish the computation of the result as 4x int32 in
249
+ // |sum|.
250
+ inline void Compute4Results(bool relu, int kShiftAmount, __m256i& sum) {
251
+ // Horizontally add the results. We have 1 register that contains results
252
+ // [0 0 1 1 2 2 3 3], but hadd (and almost no other AVX instruction) will not
253
+ // cross lanes, so we end up with [0 1 0 1 2 3 2 3]
254
+ sum = _mm256_hadd_epi32(sum, sum);
255
+ // Permutes the middle two pairs to get the answers together.
256
+ sum = _mm256_permute4x64_epi64(sum, 0xd8);
257
+ if (kShiftAmount > 0) {
258
+ // Shift right with rounding to get the right number of mantissa bits.
259
+ __m256i rounding = _mm256_set1_epi32(1 << (kShiftAmount - 1));
260
+ sum = _mm256_add_epi32(sum, rounding);
261
+ sum = _mm256_srai_epi32(sum, kShiftAmount);
262
+ }
263
+ // Now |sum| contains [|res0|, |res1|, |res2|, |res3|, |res0|, |res1|,
264
+ // |res2|, |res3|]
265
+ if (relu) {
266
+ sum = _mm256_max_epi32(sum, _mm256_setzero_si256());
267
+ }
268
+ }
269
+
270
+ // In-line function to extract the 4x int32 results from |sum| to memory.
271
+ // Non-const reference for |sum| as it is a register.
272
+ inline void Extract4xint32(bool relu, int kShiftAmount, __m256i& sum,
273
+ int32_t** out_ptr) {
274
+ Compute4Results(relu, kShiftAmount, sum);
275
+ // Save the lower 128 bits (4x int32).
276
+ __m128i result = _mm256_extractf128_si256(sum, 0);
277
+ _mm_store_si128(reinterpret_cast<__m128i*>(*out_ptr), result);
278
+ *out_ptr += 4;
279
+ }
280
+
281
+ // In-line function to extract the 4x int32 results from sum to 4x int16 in
282
+ // memory.
283
+ // Non-const reference for |sum| as it is a register.
284
+ inline void Extract4xint16(bool relu, int kShiftAmount, __m256i& sum,
285
+ int16_t** out_ptr) {
286
+ Compute4Results(relu, kShiftAmount, sum);
287
+ // Clip to 16 bit range (with saturation) and pack in the bottom 64 bits.
288
+ // Converts the lower 4x int32 in bottom 128 bits to 4x int16 in bottom 64
289
+ // bits, replicated in the next 64 bits.
290
+ sum = _mm256_packs_epi32(sum, sum);
291
+ // Save 4x int 16 from the bottom 64 bits.
292
+ *reinterpret_cast<int64_t*>(*out_ptr) = _mm256_extract_epi64(sum, 0);
293
+ *out_ptr += 4;
294
+ }
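Editor's note: the fixed-point post-processing in Compute4Results / Extract4xint16 boils down to a round-then-shift rescale followed by saturation when narrowing to 16 bits. A scalar sketch (illustrative names):

#include <algorithm>
#include <cstdint>

inline int32_t RoundAndShiftRight(int32_t value, int shift_amount) {
  if (shift_amount > 0) {
    value += 1 << (shift_amount - 1);  // rounding offset
    value >>= shift_amount;            // arithmetic shift, like _mm256_srai_epi32
  }
  return value;
}

inline int16_t SaturateToInt16(int32_t value) {
  // _mm256_packs_epi32 applies the same saturation.
  return static_cast<int16_t>(
      std::min<int32_t>(32767, std::max<int32_t>(-32768, value)));
}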
295
+
296
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
297
+ // blocked pattern, x is a vector and b is vector. Weights are stored for this
298
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
299
+ // row-major format. column indices are converted to deltas and then multiplied
300
+ // by 2 to convert to bytes, so that the value can be used directly to offset
301
+ // the pointer into the rhs vector.
302
+ //
303
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
304
+ // this function. This is automatically taken care of in SparseLinearLayer.
305
+ // The bias is reconstructed through horizontal additions, which leads to a small
306
+ // speedup by reducing latencies at the end of the loop.
307
+ template <typename WeightType, typename RhsType, typename OutType>
308
+ typename std::enable_if<
309
+ IsFixed16Type<WeightType>::value && IsFixed16Type<RhsType>::value &&
310
+ (IsFixed32Type<OutType>::value || IsFixed16Type<OutType>::value)>::type
311
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
312
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
313
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
314
+ OutType* out_ptr, int64_t assigned_rows,
315
+ int64_t rows /* only used in SpMM variants */,
316
+ int64_t cols /* only used in SpMM variants */, int relu) {
317
+ constexpr int kShiftAmount =
318
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
319
+ OutType::kMantissaBits;
320
+ static_assert(kShiftAmount >= 0,
321
+ "Result must have fewer mantissa bits than product");
322
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
323
+ // Load the biases duplicated into a 256 bit register [0 1 2 3 0 1 2 3].
324
+ __m128i bias = _mm_load_si128(reinterpret_cast<__m128i const*>(bias_ptr));
325
+ __m256i biases = _mm256_set_m128i(bias, bias);
326
+ bias_ptr += 4;
327
+ // Swap the top two pairs: [0 1 2 3 2 3 0 1]
328
+ // TODO(b/188702959): consider |_mm256_permutevar8x32|, and set the index
329
+ // register outside the row loop.
330
+ biases = _mm256_permute4x64_epi64(biases, 0xb4);
331
+ // Duplicate the low pairs in each lane: [0 0 1 1 2 2 3 3].
332
+ biases = _mm256_unpacklo_epi32(biases, biases);
333
+ // Double the results to make up for the division by 4.
334
+ // TODO(b/188702959): consider moving this to where the biases are computed.
335
+ __m256i sum = _mm256_add_epi32(biases, biases);
336
+
337
+ // TODO(b/188702959): People don't like the old-fashioned, close-to-the-
338
+ // metal notation of *|nnz_per_row|++, so measure the effect of putting the
339
+ // increment in the for loop.
340
+ int reduced_col_count = *nnz_per_row;
341
+ ++nnz_per_row;
342
+ for (int c = 0; c < reduced_col_count; ++c) {
343
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
344
+ rhs_ptr += col_delta;
345
+ // Multiply this 4x4 block.
346
+ // Get the 4x int16 into the bottom of rhs_64.
347
+ __m128i rhs_64 =
348
+ _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptr));
349
+ // Load all 16 weights.
350
+ __m256i weights =
351
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
352
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
353
+ // [0123 0123 0123 0123].
354
+ __m256i rhs = _mm256_broadcastq_epi64(rhs_64);
355
+ weights_ptr += 16;
356
+ // |_mm256_madd_epi16| does 16x16x16=16x32 bit multiply and horizontally
357
+ // adds adjacent pairs to make 8x32 bit results. Add these to the sum.
358
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(weights, rhs));
359
+ }
360
+ static_assert(
361
+ IsFixed16Type<OutType>::value || IsFixed32Type<OutType>::value,
362
+ "AVX2 kernel only supports fixed16 and fixed32 types");
363
+ // The only significant difference between fixed16 and fixed32 is the size
364
+ // of the storage unit. The registers have to be repacked accordingly.
365
+ if (IsFixed32Type<OutType>::value) {
366
+ Extract4xint32(relu, kShiftAmount, sum,
367
+ reinterpret_cast<int32_t**>(&out_ptr));
368
+ } else {
369
+ Extract4xint16(relu, kShiftAmount, sum,
370
+ reinterpret_cast<int16_t**>(&out_ptr));
371
+ }
372
+ }
373
+ }
374
+
375
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
376
+ // blocked pattern, x is a fat vector with 5 columns and b is vector. b is
377
+ // broadcast. Weights are stored for this routine by making each 4x4 block
378
+ // contiguous. Blocks are ordered in standard row-major format. column indices
379
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
380
+ // that the value can be used directly to offset the pointer into the rhs
381
+ // vector.
382
+ //
383
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
384
+ // this function. This is automatically taken care of in SparseLinearLayer.
385
+ // The bias is reconstructed through horizontal additions, which leads to a small
386
+ // speedup by reducing latencies at the end of the loop.
387
+ template <typename WeightType, typename RhsType, typename OutType>
388
+ typename std::enable_if<
389
+ IsFixed16Type<WeightType>::value && IsFixed16Type<RhsType>::value &&
390
+ (IsFixed32Type<OutType>::value || IsFixed16Type<OutType>::value)>::type
391
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
392
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
393
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
394
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
395
+ int relu) {
396
+ constexpr int kShiftAmount =
397
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
398
+ OutType::kMantissaBits;
399
+ static_assert(kShiftAmount >= 0,
400
+ "Result must have fewer mantissa bits than product");
401
+ const RhsType* rhs_ptrs[5];
402
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
403
+
404
+ OutType* out_ptrs[5];
405
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
406
+
407
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
408
+ // We will accumulate the results in 5 registers, sum_0 to sum_4.
409
+ // Load the biases duplicated into a 256 bit register [0 1 2 3 0 1 2 3].
410
+ __m128i bias = _mm_load_si128(reinterpret_cast<__m128i const*>(bias_ptr));
411
+ __m256i biases = _mm256_set_m128i(bias, bias);
412
+ bias_ptr += 4;
413
+ // Swap the top two pairs: [0 1 2 3 2 3 0 1]
414
+ biases = _mm256_permute4x64_epi64(biases, 0xb4);
415
+ // Duplicate the low pairs in each lane: [0 0 1 1 2 2 3 3].
416
+ biases = _mm256_unpacklo_epi32(biases, biases);
417
+ // Double the results to make up for the division by 4.
418
+ __m256i sum_0 = _mm256_add_epi32(biases, biases);
419
+ __m256i sum_1 = sum_0;
420
+ __m256i sum_2 = sum_0;
421
+ __m256i sum_3 = sum_0;
422
+ __m256i sum_4 = sum_0;
423
+
424
+ int reduced_col_count = *nnz_per_row;
425
+ ++nnz_per_row;
426
+ for (int c = 0; c < reduced_col_count; ++c) {
427
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
428
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
429
+ // Multiply this 4x4 block.
430
+ // Get the 4x int16 into the bottom of |rhs_64|.
431
+ __m128i rhs_64 =
432
+ _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[0]));
433
+ // Load all 16 weights.
434
+ __m256i weights =
435
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
436
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
437
+ // [0123 0123 0123 0123].
438
+ __m256i rhs = _mm256_broadcastq_epi64(rhs_64);
439
+ weights_ptr += 16;
440
+ // |_mm256_madd_epi16| does 16x16x16=16x32 bit multiply and horizontally
441
+ // adds adjacent pairs to make 8x32 bit results. Add these to the sum.
442
+ sum_0 = _mm256_add_epi32(sum_0, _mm256_madd_epi16(weights, rhs));
443
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[1]));
444
+ rhs = _mm256_broadcastq_epi64(rhs_64);
445
+ sum_1 = _mm256_add_epi32(sum_1, _mm256_madd_epi16(weights, rhs));
446
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[2]));
447
+ rhs = _mm256_broadcastq_epi64(rhs_64);
448
+ sum_2 = _mm256_add_epi32(sum_2, _mm256_madd_epi16(weights, rhs));
449
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[3]));
450
+ rhs = _mm256_broadcastq_epi64(rhs_64);
451
+ sum_3 = _mm256_add_epi32(sum_3, _mm256_madd_epi16(weights, rhs));
452
+ rhs_64 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(rhs_ptrs[4]));
453
+ rhs = _mm256_broadcastq_epi64(rhs_64);
454
+ sum_4 = _mm256_add_epi32(sum_4, _mm256_madd_epi16(weights, rhs));
455
+ }
456
+ static_assert(
457
+ IsFixed16Type<OutType>::value || IsFixed32Type<OutType>::value,
458
+ "AVX2 kernel only supports fixed16 and fixed32 types");
459
+ // The only significant difference between fixed16 and fixed32 is the size
460
+ // of the storage unit. The registers have to be repacked accordingly.
461
+ if (IsFixed32Type<OutType>::value) {
462
+ Extract4xint32(relu, kShiftAmount, sum_0,
463
+ reinterpret_cast<int32_t**>(&out_ptrs[0]));
464
+ Extract4xint32(relu, kShiftAmount, sum_1,
465
+ reinterpret_cast<int32_t**>(&out_ptrs[1]));
466
+ Extract4xint32(relu, kShiftAmount, sum_2,
467
+ reinterpret_cast<int32_t**>(&out_ptrs[2]));
468
+ Extract4xint32(relu, kShiftAmount, sum_3,
469
+ reinterpret_cast<int32_t**>(&out_ptrs[3]));
470
+ Extract4xint32(relu, kShiftAmount, sum_4,
471
+ reinterpret_cast<int32_t**>(&out_ptrs[4]));
472
+ } else {
473
+ Extract4xint16(relu, kShiftAmount, sum_0,
474
+ reinterpret_cast<int16_t**>(&out_ptrs[0]));
475
+ Extract4xint16(relu, kShiftAmount, sum_1,
476
+ reinterpret_cast<int16_t**>(&out_ptrs[1]));
477
+ Extract4xint16(relu, kShiftAmount, sum_2,
478
+ reinterpret_cast<int16_t**>(&out_ptrs[2]));
479
+ Extract4xint16(relu, kShiftAmount, sum_3,
480
+ reinterpret_cast<int16_t**>(&out_ptrs[3]));
481
+ Extract4xint16(relu, kShiftAmount, sum_4,
482
+ reinterpret_cast<int16_t**>(&out_ptrs[4]));
483
+ }
484
+ }
485
+ }
486
+
487
+ // Processes one GRU gate input with sigmoid.
488
+ template <int InputMantissaBits, int StateMantissaBits, bool SplitGates>
489
+ inline __m256i GRUGateSigmoid(const void* gate_ptr, const void* gate_other_ptr,
490
+ const __m256i& input,
491
+ const int32_t* sigmoid_table) {
492
+ __m256i gate = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(gate_ptr));
493
+ if (SplitGates) {
494
+ __m256i other =
495
+ _mm256_loadu_si256(reinterpret_cast<const __m256i*>(gate_other_ptr));
496
+ gate = _mm256_add_epi32(gate, other);
497
+ }
498
+ gate = _mm256_add_epi32(gate, input);
499
+ // Compute sigmoids on reset and update.
500
+ return csrblocksparse::fixed32_sigmoid_fixed16<InputMantissaBits,
501
+ StateMantissaBits>(
502
+ sigmoid_table, gate);
503
+ }
504
+
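Editor's note: in plain float, GRUGateSigmoid amounts to the following per element (illustrative sketch; the real version stays in fixed point and uses a sigmoid lookup table):

#include <cmath>

inline float GruGateSigmoidScalar(bool split_gates, float gate,
                                  float other_gate, float input) {
  float sum = gate + input;
  if (split_gates) sum += other_gate;
  return 1.0f / (1.0f + std::exp(-sum));
}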
505
+ // Processes the tanh and the final combination, returning the new GRU state.
506
+ template <int InputMantissaBits, int StateMantissaBits, bool SplitGates = false>
507
+ inline __m256i GRUGateState(const __m256i& cell, const __m256i& reset,
508
+ const __m256i& update,
509
+ const __m256i& rounding_offset,
510
+ const void* gate_ptr, const void* gate_other_ptr,
511
+ const void* gru_h_ptr, const int32_t* tanh_table) {
512
+ // Multiply the cell GRU output and the reset. There is a slight danger of
513
+ // loss of precision here, so use 32x32=64 bit and shift back after.
514
+ __m256i gru = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(gate_ptr));
515
+ if (SplitGates) {
516
+ __m256i other_gru =
517
+ _mm256_loadu_si256(reinterpret_cast<__m256i const*>(gate_other_ptr));
518
+ gru = _mm256_add_epi32(gru, other_gru);
519
+ }
520
+ // This only computes the products of the low-order 32 bits of each pair.
521
+ __m256i gru_lo = _mm256_mul_epi32(gru, reset);
522
+ // Swap odd and even 32-bit units and do it again to get the high products.
523
+ gru = _mm256_shuffle_epi32(gru, 0xb1);
524
+ __m256i gru_hi = _mm256_mul_epi32(gru, _mm256_shuffle_epi32(reset, 0xb1));
525
+ // Now shift right to compensate for the multiply and re-interleave the
526
+ // 32-bit results.
527
+ // NOTE: There is no shift right arithmetic for 64 bit values until AVX512!
528
+ // Fortunately it doesn't matter, as the results are being truncated to 32
529
+ // bits and we aren't shifting right by more than 32 bits here.
530
+ gru_lo = _mm256_srli_epi64(gru_lo, StateMantissaBits);
531
+ // The upper results are shifted LEFT, so we can use blend to recombine in
532
+ // a single instruction.
533
+ gru_hi = _mm256_slli_epi64(gru_hi, 32 - StateMantissaBits);
534
+ // Recombine the 32 bit results from lo and hi, alternating.
535
+ gru = _mm256_blend_epi32(gru_lo, gru_hi, 0xaa);
536
+ gru = _mm256_add_epi32(cell, gru);
537
+ // Compute tanh on the result. Although this instantly discards a bunch of
538
+ // bits, there were only 7 surplus bits for the multiply, which isn't enough
539
+ // to do it as 16x16=32.
540
+ __m256i hbar =
541
+ csrblocksparse::fixed32_tanh_fixed16<InputMantissaBits,
542
+ StateMantissaBits>(tanh_table, gru);
543
+ // Load the 16-bit previous GRU state and sign-extend to 32 bits.
544
+ gru = _mm256_cvtepi16_epi32(
545
+ _mm_load_si128(reinterpret_cast<__m128i const*>(gru_h_ptr)));
546
+ gru = _mm256_sub_epi32(gru, hbar);
547
+ // Since |gru| is 16 bit sign-extended to 32, and |update| is the output of
548
+ // sigmoid, it is always contained within 16 bits and never negative, we can
549
+ // use |madd_epi16| to do 16x16=32 multiply with horizontal adding as the
550
+ // addend will always be zero, and this is twice as fast as full blown
551
+ // 32x32=32. The only possible problem is if the subtract above caused
552
+ // overflow.
553
+ gru = _mm256_madd_epi16(gru, update);
554
+ // Renormalize to fixed16. This time rounding is critical, as this is the
555
+ // output GRU state.
556
+ gru = _mm256_add_epi32(gru, rounding_offset);
557
+ gru = _mm256_srai_epi32(gru, StateMantissaBits);
558
+ return _mm256_add_epi32(gru, hbar);
559
+ }
560
+
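Editor's note: per element, the reset-times-gate product at the top of GRUGateState is a plain fixed-point multiply. A scalar sketch of roughly what the lo/hi 64-bit multiplies, shifts and blend compute (illustrative):

#include <cstdint>

inline int32_t FixedPointMultiply(int32_t gate, int32_t reset,
                                  int state_mantissa_bits) {
  // 32x32 -> 64-bit multiply, then shift back to the gate's scale; the AVX2
  // code achieves the same with _mm256_srli_epi64 / _mm256_slli_epi64 and a
  // blend before truncating to 32 bits.
  int64_t product = static_cast<int64_t>(gate) * static_cast<int64_t>(reset);
  return static_cast<int32_t>(product >> state_mantissa_bits);
}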
561
+ template <typename Type>
562
+ typename std::enable_if<IsFixed32Type<Type>::value>::type SumVectors(
563
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
564
+ constexpr int kSIMDWidth = 8;
565
+ for (int i = start; i < end; i += kSIMDWidth) {
566
+ __m256i data1 =
567
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add1 + i));
568
+ __m256i data2 =
569
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add2 + i));
570
+ data1 = _mm256_add_epi32(data1, data2);
571
+ _mm256_store_si256(reinterpret_cast<__m256i*>(result + i), data1);
572
+ }
573
+ }
574
+
575
+ template <typename Type>
576
+ typename std::enable_if<IsFixed16Type<Type>::value>::type SumVectors(
577
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
578
+ constexpr int kSIMDWidth = 16;
579
+ for (int i = start; i < end; i += kSIMDWidth) {
580
+ __m256i data1 =
581
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add1 + i));
582
+ __m256i data2 =
583
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(add2 + i));
584
+ data1 = _mm256_add_epi16(data1, data2);
585
+ _mm256_store_si256(reinterpret_cast<__m256i*>(result + i), data1);
586
+ }
587
+ }
588
+
589
+ #endif // __AVX2__
590
+
591
+ } // namespace detail
592
+ } // namespace csrblocksparse
593
+
594
+ #undef LABEL_COL_LOOP
595
+ #undef LABEL_ROW_LOOP
596
+ #undef LABEL_SKIP_COL_LOOP
597
+ #undef LABEL_TOP_LOOP
598
+
599
+ #endif // __AVX__
600
+
601
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_AVX_H_
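Editor's note: for comparison, the behaviour both SumVectors overloads implement is just an element-wise add over [start, end); the AVX2 bodies process 8 (fixed32) or 16 (fixed16) elements per iteration. A scalar sketch for plain arithmetic element types:

template <typename T>
void SumVectorsScalar(int start, int end, const T* add1, const T* add2,
                      T* result) {
  for (int i = start; i < end; ++i) {
    result[i] = add1[i] + add2[i];
  }
}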
sparse_matmul/compute/kernels_generic.h ADDED
@@ -0,0 +1,273 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
19
+
20
+ #include <algorithm>
21
+ #include <type_traits>
22
+
23
+ #include "sparse_matmul/numerics/fixed_types.h"
24
+ #include "sparse_matmul/numerics/float16_types.h"
25
+ #include "sparse_matmul/numerics/type_utils.h"
26
+
27
+ // Separate out the assembly kernels for readability. Eventually this will
28
+ // become an ifdef switch on the architecture type.
29
+ #if defined __aarch64__
30
+ #include "sparse_matmul/compute/kernels_arm.h"
31
+ #elif defined __AVX__
32
+ #include "sparse_matmul/compute/kernels_avx.h"
33
+ #else // defined __AVX__
34
+ // If there is no architecture-specific implementation, then always use generic.
35
+ template <typename WeightType, typename RhsType, typename OutType>
36
+ struct ShouldEnableGenericSpMV_4x4 : std::true_type {};
37
+ template <typename WeightType, typename RhsType, typename OutType>
38
+ struct ShouldEnableGenericSpMM5_4x4 : std::true_type {};
39
+ template <typename WeightType, typename RhsType, typename OutType>
40
+ struct ShouldEnableGenericSpMV_1x1 : std::true_type {};
41
+ template <typename WeightType, typename RhsType, typename OutType>
42
+ struct ShouldEnableGenericSpMM5_1x1 : std::true_type {};
43
+ template <typename Type>
44
+ struct ShouldEnableGenericAdd : std::true_type {};
45
+ #endif // defined __arch64__
46
+
47
+ namespace csrblocksparse {
48
+ namespace detail {
49
+
50
+ // The computational routines do NO error checking for speed. It is assumed
51
+ // that this has been handled by CSRBlockSparseMatrix.
52
+
53
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
54
+ // blocked pattern, x is a vector and b is a vector. Weights are stored for this
55
+ // routine by making each 4x4 block contiguous. Blocks are ordered in standard
56
+ // row-major format. Column indices are converted to deltas and then multiplied
57
+ // by 2 to convert to bytes, so that the value can be used directly to offset
58
+ // the pointer into the rhs vector.
59
+ //
60
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
61
+ // this function. This is automatically taken care of in SparseLinearLayer.
62
+ // The bias is reconstructed through horizontal additions, which leads to a small
63
+ // speedup by reducing latencies at the end of the loop.
64
+ template <typename WeightType, typename RhsType, typename OutType>
65
+ typename std::enable_if<
66
+ ShouldEnableGenericSpMV_4x4<WeightType, RhsType, OutType>::value>::type
67
+ SpMV_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
68
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
69
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
70
+ OutType* out_ptr, int64_t assigned_rows,
71
+ int64_t rows /* only used in SpMM variants */,
72
+ int64_t cols /* only used in SpMM variants */, int relu) {
73
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
74
+ float accumulators[4];
75
+ // Undo the division by 4 that happens for the assembly version.
76
+ for (int i = 0; i < 4; ++i)
77
+ accumulators[i] = 4.f * static_cast<float>(*bias_ptr++);
78
+
79
+ int reduced_col_count = *nnz_per_row++;
80
+ for (int c = 0; c < reduced_col_count; ++c) {
81
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
82
+ rhs_ptr += col_delta;
83
+
84
+ // Multiply this 4x4 block.
85
+ for (int i = 0; i < 4; ++i) {
86
+ for (int j = 0; j < 4; ++j) {
87
+ accumulators[i] += static_cast<float>(*weights_ptr++) *
88
+ static_cast<float>(rhs_ptr[j]);
89
+ }
90
+ }
91
+ }
92
+
93
+ for (int i = 0; i < 4; ++i)
94
+ *out_ptr++ = static_cast<OutType>(relu ? std::max(accumulators[i], 0.f)
95
+ : accumulators[i]);
96
+ }
97
+ }
98
+
99
+ // Performs the calculation y = A * x + b where A is a sparse matrix with a 4x4
100
+ // blocked pattern, x is a fat vector with 5 columns and b is a vector. b is
101
+ // broadcast. Weights are stored for this routine by making each 4x4 block
102
+ // contiguous. Blocks are ordered in standard row-major format. Column indices
103
+ // are converted to deltas and then multiplied by 2 to convert to bytes, so
104
+ // that the value can be used directly to offset the pointer into the rhs
105
+ // vector.
106
+ //
107
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
108
+ // this function. This is automatically taken care of in SparseLinearLayer.
109
+ // The bias is reconstructed through horizontal additions, which leads to a small
110
+ // speedup by reducing latencies at the end of the loop.
111
+ template <typename WeightType, typename RhsType, typename OutType>
112
+ typename std::enable_if<
113
+ ShouldEnableGenericSpMM5_4x4<WeightType, RhsType, OutType>::value>::type
114
+ SpMM5_4x4(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
115
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
116
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
117
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
118
+ int relu) {
119
+ const RhsType* rhs_ptrs[5];
120
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
121
+
122
+ OutType* out_ptrs[5];
123
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
124
+
125
+ for (int reduced_row = 0; reduced_row < assigned_rows; ++reduced_row) {
126
+ float accumulators[4][5];
127
+ // Undo the division by 4 that happens for the assembly version.
128
+ for (int i = 0; i < 4; ++i) {
129
+ for (int k = 0; k < 5; ++k) {
130
+ accumulators[i][k] = 4.f * static_cast<float>(*bias_ptr);
131
+ }
132
+ ++bias_ptr;
133
+ }
134
+
135
+ int reduced_col_count = *nnz_per_row++;
136
+ for (int c = 0; c < reduced_col_count; ++c) {
137
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
138
+ for (int k = 0; k < 5; ++k) rhs_ptrs[k] += col_delta;
139
+
140
+ // multiply this 4x4 block
141
+ for (int i = 0; i < 4; ++i) {
142
+ for (int j = 0; j < 4; ++j) {
143
+ for (int k = 0; k < 5; ++k) {
144
+ accumulators[i][k] += static_cast<float>(*weights_ptr) *
145
+ static_cast<float>(rhs_ptrs[k][j]);
146
+ }
147
+ weights_ptr++;
148
+ }
149
+ }
150
+ }
151
+
152
+ for (int k = 0; k < 5; ++k) {
153
+ for (int i = 0; i < 4; ++i) {
154
+ out_ptrs[k][0] = static_cast<OutType>(
155
+ relu ? std::max(accumulators[i][k], 0.f) : accumulators[i][k]);
156
+ out_ptrs[k]++;
157
+ }
158
+ }
159
+ }
160
+ }
161
+
162
+ // Performs the calculation y = A * x + b where A is a sparse matrix with
163
+ // a 1x1 blocked pattern (i.e. unstructured), x is a
164
+ // vector and b is a vector.
165
+ // Weights are stored for this routine in standard CSR format. Each row must
166
+ // have a multiple of 8 columns.
167
+ // Column indices are converted to deltas and then multiplied by 2 to convert
168
+ // to bytes, so that the value can be used directly to offset the pointer
169
+ // into the rhs vector.
170
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
171
+ // this function. This is automatically taken care of in SparseLinearLayer.
172
+ // The bias is reconstructed through horizontal additions, which leads to a small
173
+ // speedup by reducing latencies at the end of the loop.
174
+ template <typename WeightType, typename RhsType, typename OutType>
175
+ typename std::enable_if<
176
+ ShouldEnableGenericSpMV_1x1<WeightType, RhsType, OutType>::value>::type
177
+ SpMV_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
178
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
179
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
180
+ OutType* out_ptr, int64_t assigned_rows,
181
+ int64_t rows /* only used in SpMM variants */,
182
+ int64_t cols /* only used in SpMM variants */, int relu) {
183
+ for (int row = 0; row < assigned_rows; ++row) {
184
+ // Undo the division by 4 that happens for the assembly version.
185
+ float accumulator = 4.f * static_cast<float>(*bias_ptr++);
186
+
187
+ int col_count = *nnz_per_row++;
188
+ for (int c = 0; c < col_count; ++c) {
189
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
190
+ rhs_ptr += col_delta;
191
+
192
+ accumulator +=
193
+ static_cast<float>(*weights_ptr++) * static_cast<float>(*rhs_ptr);
194
+ }
195
+
196
+ *out_ptr++ =
197
+ static_cast<OutType>(relu ? std::max(accumulator, 0.f) : accumulator);
198
+ }
199
+ }
200
+
201
+ // Performs the calculation y = A * x + b where A is a sparse matrix with
202
+ // a 1x1 blocked pattern (i.e. unstructured), x is a fat
203
+ // vector with 5 columns and b is a vector. b is broadcast.
204
+ // Weights are stored for this routine in standard CSR format. Each row must
205
+ // have a multiple of 8 columns.
206
+ // Column indices are converted to deltas and then multiplied by 2 to convert
207
+ // to bytes, so that the value can be used directly to offset the pointer
208
+ // into the rhs vector.
209
+ // NOTE: The bias is expected to have been multiplied by .25f prior to calling
210
+ // this function. This is automatically taken care of in SparseLinearLayer.
211
+ // The bias is reconstructed through horizontal additions, which leads to a small
212
+ // speedup by reducing latencies at the end of the loop.
213
+ template <typename WeightType, typename RhsType, typename OutType>
214
+ typename std::enable_if<
215
+ ShouldEnableGenericSpMM5_1x1<WeightType, RhsType, OutType>::value>::type
216
+ SpMM5_1x1(const WeightType* weights_ptr, const int16_t* col_deltas_bytes,
217
+ const int32_t* nnz_per_row, const RhsType* rhs_ptr,
218
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias_ptr,
219
+ OutType* out_ptr, int64_t assigned_rows, int64_t rows, int64_t cols,
220
+ int relu) {
221
+ const RhsType* rhs_ptrs[5];
222
+ for (int i = 0; i < 5; ++i) rhs_ptrs[i] = rhs_ptr + i * cols;
223
+
224
+ OutType* out_ptrs[5];
225
+ for (int i = 0; i < 5; ++i) out_ptrs[i] = out_ptr + i * rows;
226
+
227
+ for (int row = 0; row < assigned_rows; ++row) {
228
+ // Undo the division by 4 that happens for the assembly version.
229
+ float accumulator[5];
230
+ for (int i = 0; i < 5; ++i)
231
+ accumulator[i] = 4.f * static_cast<float>(*bias_ptr);
232
+
233
+ ++bias_ptr;
234
+
235
+ int col_count = *nnz_per_row++;
236
+ for (int c = 0; c < col_count; ++c) {
237
+ int col_delta = *col_deltas_bytes++ / sizeof(RhsType);
238
+ for (int i = 0; i < 5; ++i) {
239
+ rhs_ptrs[i] += col_delta;
240
+ accumulator[i] += static_cast<float>(*weights_ptr) *
241
+ static_cast<float>(rhs_ptrs[i][0]);
242
+ }
243
+ weights_ptr++;
244
+ }
245
+
246
+ for (int i = 0; i < 5; ++i) {
247
+ out_ptrs[i][0] = static_cast<OutType>(relu ? std::max(accumulator[i], 0.f)
248
+ : accumulator[i]);
249
+ out_ptrs[i]++;
250
+ }
251
+ }
252
+ }
253
+
254
+ template <typename Type>
255
+ typename std::enable_if<ShouldEnableGenericAdd<Type>::value>::type SumVectors(
256
+ int start, int end, const Type* add1, const Type* add2, Type* result) {
257
+ LOG_FIRST_N(WARNING, 1) << "SumVectors: using generic kernel!";
258
+ for (int i = start; i < end; ++i) {
259
+ Type sum = static_cast<Type>(static_cast<float>(add1[i]) +
260
+ static_cast<float>(add2[i]));
261
+ result[i] = sum;
262
+ }
263
+ }
264
+
265
+ } // namespace detail
266
+ } // namespace csrblocksparse
267
+
268
+ #undef LABEL_COL_LOOP
269
+ #undef LABEL_ROW_LOOP
270
+ #undef LABEL_SKIP_COL_LOOP
271
+ #undef LABEL_TOP_LOOP
272
+
273
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_KERNELS_GENERIC_H_
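Note on the storage conventions used by the generic kernels above: column offsets arrive as byte deltas into the rhs vector, and the bias arrives pre-scaled by 0.25f, which the kernel undoes by multiplying by 4. The following standalone sketch (plain C++, hypothetical data, not part of the library) walks one 4x4 block the same way SpMV_4x4 does:

// Standalone illustration of the SpMV_4x4 storage conventions above.
// The weights, rhs and bias values here are made up for illustration.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // One 4x4 block at block-column 0: y = W * x + b with W == identity.
  std::vector<float> weights = {1, 0, 0, 0,  0, 1, 0, 0,
                                0, 0, 1, 0,  0, 0, 0, 1};
  std::vector<float> rhs = {1.f, 2.f, 3.f, 4.f};
  // Bias is stored pre-multiplied by .25f, as SparseLinearLayer does.
  std::vector<float> bias = {0.25f * 10.f, 0.25f * 20.f, 0.25f * 30.f, 0.25f * 40.f};
  std::vector<int32_t> nnz_per_row = {1};       // one block in this row.
  std::vector<int16_t> col_deltas_bytes = {0};  // delta to column 0, in bytes.

  const float* rhs_ptr = rhs.data();
  const float* w = weights.data();
  float out[4];
  for (int row = 0; row < 1; ++row) {
    float acc[4];
    for (int i = 0; i < 4; ++i) acc[i] = 4.f * bias[row * 4 + i];  // undo the .25f.
    for (int c = 0; c < nnz_per_row[row]; ++c) {
      // Byte delta converted back to an element delta, as in the kernel.
      rhs_ptr += col_deltas_bytes[c] / static_cast<int>(sizeof(float));
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) acc[i] += w[16 * c + 4 * i + j] * rhs_ptr[j];
    }
    for (int i = 0; i < 4; ++i) out[i] = acc[i];
  }
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 11 22 33 44
  return 0;
}

Running it prints 11 22 33 44, i.e. W*x plus the un-scaled bias.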
sparse_matmul/compute/matmul.h ADDED
@@ -0,0 +1,199 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
19
+
20
+ #include <cstdint>
21
+ #include <vector>
22
+
23
+ #include "absl/time/time.h"
24
+ #include "sparse_matmul/compute/matmul_fixed_avx2.h"
25
+ #include "sparse_matmul/compute/matmul_generic.h"
26
+ #include "sparse_matmul/numerics/fixed_types.h"
27
+ #include "sparse_matmul/numerics/type_utils.h"
28
+ #if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
29
+ #include <cpuid.h>
30
+ #endif
31
+
32
+ namespace csrblocksparse {
33
+
34
+ // The number of elements in a block.
35
+ constexpr int kBlockSize = 4;
36
+
37
+ // Base class for Matmul containing the members that are non-type-specific.
38
+ class MatmulBase {
39
+ public:
40
+ // Constructor initializes the flags that determine which implementation to
41
+ // use at run-time, constrained by both compiler flags and cpuid.
42
+ MatmulBase() {
43
+ #if defined(__x86_64__) || defined(__i386__) || defined(_WIN32)
44
+ // Code tested to work on Linux systems and multiple Android emulators.
45
+ unsigned int eax, ebx, ecx, edx;
46
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
47
+ using_avx_ = (ecx & bit_AVX) != 0;
48
+ if (using_avx_) {
49
+ __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
50
+ using_avx2_ = (ebx & bit_AVX2) != 0;
51
+ using_avx512_ = (ebx & bit_AVX512F) != 0 && (ebx & bit_AVX512DQ) &&
52
+ (ebx & bit_AVX512BW) != 0;
53
+ VLOG(2) << "avx2 flag=" << using_avx2_ << " 512=" << using_avx512_;
54
+ } else {
55
+ LOG(ERROR) << "AVX not found at all!";
56
+ }
57
+ }
58
+ #else
59
+ using_aarch64_ = true;
60
+ #endif
61
+ }
62
+
63
+ protected:
64
+ // Flags that define what (runtime) architectures are available. Flags that
65
+ // are set are limited by both the compiler flags and runtime environment.
66
+ bool using_avx512_ = false;
67
+ bool using_avx2_ = false;
68
+ bool using_avx_ = false;
69
+ bool using_aarch64_ = false;
70
+ };
71
+
72
+ // The master template is really a catch-all for the unimplemented cases to
73
+ // report an error.
74
+ template <typename WeightType, typename RhsType>
75
+ class Matmul : public MatmulBase {
76
+ public:
77
+ // Sparse inputs, outputs replicated strided for each thread.
78
+ template <typename OutType>
79
+ void MatVec4x4(const WeightType* weights, const RhsType* rhs,
80
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias,
81
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
82
+ int start_row, int end_row, bool relu, int replicas,
83
+ int stride, OutType* output) {
84
+ // The specializations should take care of every real case.
85
+ CHECK(false) << "Unsupported combination of types used!";
86
+ }
87
+ template <typename OutType>
88
+ void MatVec8x4(const WeightType* weights, const RhsType* rhs,
89
+ const typename TypeOfProduct<WeightType, RhsType>::type* bias,
90
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
91
+ int start_row, int end_row, bool relu, int replicas,
92
+ int stride, OutType* output) {
93
+ // The specializations should take care of every real case.
94
+ CHECK(false) << "Unsupported combination of types used!";
95
+ }
96
+ };
97
+
98
+ // Full specialization for float.
99
+ template <>
100
+ class Matmul<float, float> : public MatmulBase {
101
+ public:
102
+ void MatVec4x4(const float* weights, const float* rhs, const float* bias,
103
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
104
+ int start_row, int end_row, bool relu, int replicas,
105
+ int stride, float* output) {
106
+ detail::MatVecFloatGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
107
+ start_row, end_row, /*block_height=*/4,
108
+ /*block_width=*/4, relu, replicas, stride,
109
+ output);
110
+ }
111
+ void MatVec8x4(const float* weights, const float* rhs, const float* bias,
112
+ const int32_t* nnz_per_row, const int16_t* rhs_indices,
113
+ int start_row, int end_row, bool relu, int replicas,
114
+ int stride, float* output) {
115
+ detail::MatVecFloatGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
116
+ start_row, end_row, /*block_height=*/8,
117
+ /*block_width=*/4, relu, replicas, stride,
118
+ output);
119
+ }
120
+ };
121
+
122
+ // Partial specialization for fixed types. Covers fixed16xfixed16 = OutType,
123
+ // where OutType should be fixed16 or fixed32. The mantissa bits don't have
124
+ // to match.
125
+ template <int WeightBits, int RhsBits>
126
+ class Matmul<fixed16<WeightBits>, fixed16<RhsBits>> : public MatmulBase {
127
+ public:
128
+ using WeightType = fixed16<WeightBits>;
129
+ using RhsType = fixed16<RhsBits>;
130
+
131
+ template <typename OutType>
132
+ void MatVec4x4(const int16_t* weights, const int16_t* rhs,
133
+ const int32_t* bias, const int32_t* nnz_per_row,
134
+ const int16_t* rhs_indices, int start_row, int end_row,
135
+ bool relu, int replicas, int stride, OutType* output) {
136
+ constexpr int kShiftAmount =
137
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
138
+ OutType::kMantissaBits;
139
+ static_assert(kShiftAmount >= 0,
140
+ "OutType must not have more mantissa bits than inputs");
141
+ #if defined __AVX2__
142
+ CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
143
+ if (sizeof(*output) == 4) {
144
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
145
+ detail::MatVec4x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
146
+ start_row, end_row, relu, kShiftAmount,
147
+ replicas, stride, out32);
148
+ } else {
149
+ int16_t* out16 = reinterpret_cast<int16_t*>(output);
150
+ detail::MatVec4x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
151
+ start_row, end_row, relu, kShiftAmount,
152
+ replicas, stride, out16);
153
+ }
154
+ #elif defined __aarch64__
155
+ if (using_aarch64_) {
156
+ LOG(FATAL) << "Fixed16 MatVec4x4 not yet implemented!";
157
+ }
158
+
159
+ #else
160
+ detail::MatVecFixedGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
161
+ start_row, end_row, /*block_height=*/4,
162
+ /*block_width=*/4, relu, sizeof(*output),
163
+ kShiftAmount, replicas, stride, output);
164
+ #endif // __AVX2__
165
+ }
166
+
167
+ template <typename OutType>
168
+ void MatVec8x4(const int16_t* weights, const int16_t* rhs,
169
+ const int32_t* bias, const int32_t* nnz_per_row,
170
+ const int16_t* rhs_indices, int start_row, int end_row,
171
+ bool relu, int replicas, int stride, OutType* output) {
172
+ constexpr int kShiftAmount =
173
+ TypeOfProduct<WeightType, RhsType>::type::kMantissaBits -
174
+ OutType::kMantissaBits;
175
+ static_assert(kShiftAmount >= 0,
176
+ "OutType must not have more mantissa bits than inputs");
177
+ #if defined __AVX2__
178
+ CHECK(replicas == 1 && sizeof(*output) == 4)
179
+ << "Only replicas == 1 and fixed32 output are implemented for AVX2!";
180
+ CHECK(using_avx2_) << "Compiled for AVX2, but cpu flag not set!";
181
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
182
+ detail::MatVec8x4FixedAVX2(weights, rhs, bias, nnz_per_row, rhs_indices,
183
+ start_row, end_row, relu, kShiftAmount, out32);
184
+ #elif defined __aarch64__
185
+ if (using_aarch64_) {
186
+ LOG(FATAL) << "Fixed16 MatVec8x4 not yet implemented!";
187
+ }
188
+ #else
189
+ detail::MatVecFixedGeneric(weights, rhs, bias, nnz_per_row, rhs_indices,
190
+ start_row, end_row, /*block_height=*/8,
191
+ /*block_width=*/4, relu, sizeof(*output),
192
+ kShiftAmount, replicas, stride, output);
193
+ #endif // __AVX2__
194
+ }
195
+ };
196
+
197
+ } // namespace csrblocksparse
198
+
199
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_H_
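The fixed-point specializations above compute kShiftAmount as the difference between the mantissa bits of the product/accumulator type and those of OutType, then shift right with rounding. A minimal standalone sketch of that rescaling, using a hypothetical helper and hypothetical Q4.11 operands rather than the library's fixed16 types:

// Standalone sketch of the fixed-point rescaling done via kShiftAmount.
#include <cstdint>
#include <cstdio>

// Multiply two Q4.11 values; the product carries 11 + 11 fractional bits,
// so reducing to |out_mantissa| fractional bits needs a right shift with
// rounding, analogous to the shift_out handling in the AVX2 kernels.
// |out_mantissa| must not exceed 22, mirroring the static_assert above.
int32_t MulAndRescale(int16_t w_q11, int16_t x_q11, int out_mantissa) {
  const int product_mantissa = 11 + 11;  // mantissa bits add on multiply.
  int32_t product = static_cast<int32_t>(w_q11) * static_cast<int32_t>(x_q11);
  int shift = product_mantissa - out_mantissa;     // analogous to kShiftAmount.
  int32_t rounding = shift > 0 ? (1 << (shift - 1)) : 0;
  return (product + rounding) >> shift;
}

int main() {
  // 1.5 * 2.0 in Q4.11: 1.5 -> 3072, 2.0 -> 4096.
  int32_t y = MulAndRescale(3072, 4096, 11);  // result back in Q*.11
  std::printf("%f\n", y / 2048.0);            // prints 3.000000
  return 0;
}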
sparse_matmul/compute/matmul_fixed_avx2.cc ADDED
@@ -0,0 +1,235 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/matmul_fixed_avx2.h"
16
+
17
+ #include <cstdint>
18
+
19
+ #if defined __AVX__
20
+ #include <immintrin.h>
21
+ #endif
22
+
23
+ #include "sparse_matmul/compute/matmul.h"
24
+
25
+ namespace csrblocksparse {
26
+ namespace detail {
27
+
28
+ static const int32_t kint32min = static_cast<int32_t>(~0x7FFFFFFF);
29
+ static const int32_t kint32max = static_cast<int32_t>(0x7FFFFFFF);
30
+
31
+ #if defined __AVX2__
32
+ // In-line function computes and returns the result of one row (of blocks) as
33
+ // 4x int32_t. |weights_ptr| is a non-const reference so it can easily be
34
+ // interpreted as belonging to the caller.
35
+ inline __m256i ComputeRowResults(const __m128i& bias128, const int16_t* rhs,
36
+ const int16_t* rhs_indices, int nnz,
37
+ int16_t const*& weights_ptr) {
38
+ // Expand bias to 64 bits in a 256 bit register [0 z 1 z 2 z 3 z], where z is
39
+ // Zero and 0-3 are the 4x32 bit bias values.
40
+ __m256i sum = _mm256_cvtepu32_epi64(bias128);
41
+
42
+ for (int c = 0; c < nnz; ++c) {
43
+ int rhs_index = rhs_indices[c];
44
+ // Load all 16 weights.
45
+ __m256i weights =
46
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
47
+ // Get the 4x int16_t into the bottom of |rhs_64|.
48
+ __m128i rhs_64 = _mm_loadl_epi64(
49
+ reinterpret_cast<__m128i const*>(rhs + rhs_index * kBlockSize));
50
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
51
+ // [0123 0123 0123 0123].
52
+ __m256i rhs_value = _mm256_broadcastq_epi64(rhs_64);
53
+ weights_ptr += 16;
54
+ sum = _mm256_add_epi32(sum, _mm256_madd_epi16(weights, rhs_value));
55
+ }
56
+ // Horizontally add the results. We have 1 register that contains results
57
+ // [0 0 1 1 2 2 3 3], but hadd (and almost no other AVX instruction) will not
58
+ // cross lanes, so we end up with [0 1 0 1 2 3 2 3]
59
+ sum = _mm256_hadd_epi32(sum, sum);
60
+ // Permutes the middle two pairs to get the answers together.
61
+ return _mm256_permute4x64_epi64(sum, 0xd8);
62
+ }
63
+
64
+ // Template that allows any fixed combination of OutType and replicas, plus
65
+ // variable |relu|, |shift_out|. Note that |kReplicas| is a template arg as
66
+ // well as a function arg so we can hard-code a limited amount of unrolling.
67
+ template <typename OutType, int kReplicas>
68
+ void MatVec4x4FixedAVX2Template(const int16_t* weights_ptr, const int16_t* rhs,
69
+ const int32_t* bias, const int32_t* nnz_per_row,
70
+ const int16_t* rhs_indices, int start_row,
71
+ int end_row, bool relu, int shift_out,
72
+ int replicas, int stride, OutType* output) {
73
+ int rounding_addon = shift_out > 0 ? (1 << (shift_out - 1)) : 0;
74
+ __m256i rounding = _mm256_set1_epi32(rounding_addon);
75
+ __m256i zero = relu ? _mm256_setzero_si256() : _mm256_set1_epi32(kint32min);
76
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
77
+ // Load 4 biases [0 1 2 3].
78
+ __m128i bias128 = _mm_load_si128(reinterpret_cast<__m128i const*>(bias));
79
+ bias += kBlockSize;
80
+ int nnz = nnz_per_row[row_block];
81
+ __m256i sum =
82
+ ComputeRowResults(bias128, rhs, rhs_indices, nnz, weights_ptr);
83
+ rhs_indices += nnz;
84
+ // Shift right with rounding to get the right number of mantissa bits.
85
+ sum = _mm256_add_epi32(sum, rounding);
86
+ sum = _mm256_srai_epi32(sum, shift_out);
87
+ // Now sum contains [res0, res1, res2, res3, res0, res1, res2, res3]
88
+ sum = _mm256_max_epi32(sum, zero);
89
+ if (sizeof(OutType) == 2) {
90
+ // Clip to 16 bit range (with saturation) and pack in the bottom 64
91
+ // bits. The 64 bit result is replicated across the whole 256 bit
92
+ // register. [0123 0123 0123 0123]
93
+ sum = _mm256_packs_epi32(sum, sum);
94
+ int64_t result = _mm256_extract_epi64(sum, 0);
95
+ *reinterpret_cast<int64_t*>(output) = result;
96
+ if (kReplicas > 1) {
97
+ *reinterpret_cast<int64_t*>(output + stride) = result;
98
+ if (kReplicas > 2) {
99
+ for (int r = 2; r < replicas; ++r) {
100
+ *reinterpret_cast<int64_t*>(output + r * stride) = result;
101
+ }
102
+ }
103
+ }
104
+ } else {
105
+ // Save the lower 128 bits (4x int32_t).
106
+ __m128i result = _mm256_extractf128_si256(sum, 0);
107
+ _mm_store_si128(reinterpret_cast<__m128i*>(output), result);
108
+ if (kReplicas > 1) {
109
+ _mm_store_si128(reinterpret_cast<__m128i*>(output + stride), result);
110
+ if (kReplicas > 2) {
111
+ for (int r = 2; r < replicas; ++r) {
112
+ _mm_store_si128(reinterpret_cast<__m128i*>(output + r * stride),
113
+ result);
114
+ }
115
+ }
116
+ }
117
+ }
118
+ output += kBlockSize;
119
+ }
120
+ }
121
+
122
+ // Version that covers all possible combinations of the variable conditions:
123
+ // |relu|, |shift_out|, |replicas|, with int16_t |output|.
124
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
125
+ const int32_t* bias, const int32_t* nnz_per_row,
126
+ const int16_t* rhs_indices, int start_row, int end_row,
127
+ bool relu, int shift_out, int replicas, int stride,
128
+ int16_t* output) {
129
+ if (replicas <= 1) {
130
+ MatVec4x4FixedAVX2Template<int16_t, 1>(weights_ptr, rhs, bias, nnz_per_row,
131
+ rhs_indices, start_row, end_row,
132
+ relu, shift_out, 1, stride, output);
133
+ } else if (replicas == 2) {
134
+ MatVec4x4FixedAVX2Template<int16_t, 2>(weights_ptr, rhs, bias, nnz_per_row,
135
+ rhs_indices, start_row, end_row,
136
+ relu, shift_out, 2, stride, output);
137
+ } else {
138
+ MatVec4x4FixedAVX2Template<int16_t, 3>(
139
+ weights_ptr, rhs, bias, nnz_per_row, rhs_indices, start_row, end_row,
140
+ relu, shift_out, replicas, stride, output);
141
+ }
142
+ }
143
+
144
+ // Version that covers all possible combinations of the variable conditions:
145
+ // |relu|, |shift_out|, |replicas|, with int32_t |output|.
146
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
147
+ const int32_t* bias, const int32_t* nnz_per_row,
148
+ const int16_t* rhs_indices, int start_row, int end_row,
149
+ bool relu, int shift_out, int replicas, int stride,
150
+ int32_t* output) {
151
+ if (replicas <= 1) {
152
+ MatVec4x4FixedAVX2Template<int32_t, 1>(weights_ptr, rhs, bias, nnz_per_row,
153
+ rhs_indices, start_row, end_row,
154
+ relu, shift_out, 1, stride, output);
155
+ } else if (replicas == 2) {
156
+ MatVec4x4FixedAVX2Template<int32_t, 2>(weights_ptr, rhs, bias, nnz_per_row,
157
+ rhs_indices, start_row, end_row,
158
+ relu, shift_out, 2, stride, output);
159
+ } else {
160
+ MatVec4x4FixedAVX2Template<int32_t, 3>(
161
+ weights_ptr, rhs, bias, nnz_per_row, rhs_indices, start_row, end_row,
162
+ relu, shift_out, replicas, stride, output);
163
+ }
164
+ }
165
+
166
+ // In-line function computes and returns the result of one row (of blocks) as
167
+ // 8x int32_t. weights_ptr is a non-const reference so it can easily be
168
+ // interpreted as belonging to the caller.
169
+ inline __m256i Compute8RowResults(const __m256i& bias256, const int16_t* rhs,
170
+ const int16_t* rhs_indices, int nnz,
171
+ int16_t const*& weights_ptr) {
172
+ // Expand bias to 64 bits in a 256 bit register [0 z 1 z 2 z 3 z], where z is
173
+ // Zero and 0-3 are the 4x32 bit bias values from 128 bit half of the input.
174
+ __m256i sum1 = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(bias256));
175
+ // Plus 4 more in another sum register from the upper 128 bit half.
176
+ __m256i sum2 = _mm256_cvtepu32_epi64(_mm256_extractf128_si256(bias256, 1));
177
+
178
+ for (int c = 0; c < nnz; ++c) {
179
+ int rhs_index = rhs_indices[c];
180
+ // Load all 16 weights.
181
+ __m256i weights =
182
+ _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
183
+ // Get the 4x int16_t into the bottom of |rhs_64|.
184
+ __m128i rhs_64 = _mm_loadl_epi64(
185
+ reinterpret_cast<__m128i const*>(rhs + rhs_index * kBlockSize));
186
+ // Broadcast the rhs, pretending that each is a 64-bit unit:
187
+ // [0123 0123 0123 0123].
188
+ __m256i rhs_value = _mm256_broadcastq_epi64(rhs_64);
189
+ weights_ptr += 16;
190
+ sum1 = _mm256_add_epi32(sum1, _mm256_madd_epi16(weights, rhs_value));
191
+ // Same again for the other 4 results, re-using the same rhs value.
192
+ weights = _mm256_load_si256(reinterpret_cast<__m256i const*>(weights_ptr));
193
+ weights_ptr += 16;
194
+ sum2 = _mm256_add_epi32(sum2, _mm256_madd_epi16(weights, rhs_value));
195
+ }
196
+ // Horizontally add the results. We have 2 registers that contain results
197
+ // [0 0 1 1 2 2 3 3], and [4 4 5 5 6 6 7 7] but hadd (and almost no other AVX
198
+ // instruction) will not cross lanes, so we end up with [0 1 4 5 2 3 6 7]
199
+ sum1 = _mm256_hadd_epi32(sum1, sum2);
200
+ // Permutes the middle two pairs to get the answers in the right order.
201
+ return _mm256_permute4x64_epi64(sum1, 0xd8);
202
+ }
203
+
204
+ // Version that covers the main conditions used with 8x4:
205
+ // |relu|, |shift_out|, with int32_t |output|.
206
+ void MatVec8x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
207
+ const int32_t* bias, const int32_t* nnz_per_row,
208
+ const int16_t* rhs_indices, int start_row, int end_row,
209
+ bool relu, int shift_out, int32_t* output) {
210
+ int rounding_addon = shift_out > 0 ? (1 << (shift_out - 1)) : 0;
211
+ __m256i rounding = _mm256_set1_epi32(rounding_addon);
212
+ __m256i zero = relu ? _mm256_setzero_si256() : _mm256_set1_epi32(kint32min);
213
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
214
+ // Load 8 biases [0 1 2 3 4 5 6 7].
215
+ __m256i bias256 = _mm256_load_si256(reinterpret_cast<__m256i const*>(bias));
216
+ bias += kBlockSize * 2;
217
+ int nnz = nnz_per_row[row_block];
218
+ __m256i sum =
219
+ Compute8RowResults(bias256, rhs, rhs_indices, nnz, weights_ptr);
220
+ rhs_indices += nnz;
221
+ // Shift right with rounding to get the right number of mantissa bits.
222
+ sum = _mm256_add_epi32(sum, rounding);
223
+ sum = _mm256_srai_epi32(sum, shift_out);
224
+ // Now sum contains [res0, res1, res2, res3, res4, res5, res6, res7]
225
+ sum = _mm256_max_epi32(sum, zero);
226
+ // Save all 256 bits (8x int32_t).
227
+ _mm256_store_si256(reinterpret_cast<__m256i*>(output), sum);
228
+ output += kBlockSize * 2;
229
+ }
230
+ }
231
+
232
+ #endif
233
+
234
+ } // namespace detail
235
+ } // namespace csrblocksparse
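The AVX2 row kernels above lean on _mm256_madd_epi16, which multiplies corresponding int16 elements and sums adjacent pairs into int32 lanes, so each 4x4 block contributes two partial sums per output row that the later hadd/permute collapses. A scalar model of that step (illustrative only, no intrinsics; data made up):

// Scalar model of the _mm256_madd_epi16 step used in ComputeRowResults.
#include <cstdint>
#include <cstdio>

// Each 32-bit lane is the sum of two adjacent int16 products.
void MaddEpi16Model(const int16_t a[16], const int16_t b[16], int32_t out[8]) {
  for (int lane = 0; lane < 8; ++lane) {
    out[lane] = static_cast<int32_t>(a[2 * lane]) * b[2 * lane] +
                static_cast<int32_t>(a[2 * lane + 1]) * b[2 * lane + 1];
  }
}

int main() {
  // One 4x4 block of weights (row-major) and the broadcast rhs [1 2 3 4].
  int16_t weights[16] = {1, 0, 0, 0,  0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 1};
  int16_t rhs[16] = {1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4};
  int32_t lanes[8];
  MaddEpi16Model(weights, rhs, lanes);
  // Adjacent lane pairs hold the two halves of each row's dot product.
  for (int row = 0; row < 4; ++row) {
    std::printf("row %d = %d\n", row, lanes[2 * row] + lanes[2 * row + 1]);
  }
  return 0;
}

With the identity block this prints 1, 2, 3, 4, i.e. W*x for the broadcast rhs.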
sparse_matmul/compute/matmul_fixed_avx2.h ADDED
@@ -0,0 +1,49 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
19
+
20
+ #include <cstdint>
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ // Version that covers all possible combinations of the variable conditions:
26
+ // |relu|, |shift_out|, |replicas|, with int16 output.
27
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
28
+ const int32_t* bias, const int32_t* nnz_per_row,
29
+ const int16_t* rhs_indices, int start_row, int end_row,
30
+ bool relu, int shift_out, int replicas, int stride,
31
+ int16_t* output);
32
+ // Version that covers all possible combinations of the variable conditions:
33
+ // |relu|, |shift_out|, |replicas|, with int32 output.
34
+ void MatVec4x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
35
+ const int32_t* bias, const int32_t* nnz_per_row,
36
+ const int16_t* rhs_indices, int start_row, int end_row,
37
+ bool relu, int shift_out, int replicas, int stride,
38
+ int32_t* output);
39
+ // Version that covers the main conditions used with 8x4:
40
+ // |relu|, |shift_out|, with int32 output.
41
+ void MatVec8x4FixedAVX2(const int16_t* weights_ptr, const int16_t* rhs,
42
+ const int32_t* bias, const int32_t* nnz_per_row,
43
+ const int16_t* rhs_indices, int start_row, int end_row,
44
+ bool relu, int shift_out, int32_t* output);
45
+
46
+ } // namespace detail
47
+ } // namespace csrblocksparse
48
+
49
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_FIXED_AVX2_H_
sparse_matmul/compute/matmul_generic.cc ADDED
@@ -0,0 +1,122 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/matmul_generic.h"
16
+
17
+ #include <cstdint>
18
+ #include <vector>
19
+
20
+ #include "sparse_matmul/compute/matmul.h"
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ void MatVecFloatGeneric(const float* weights, const float* rhs,
26
+ const float* bias, const int32_t* nnz_per_row,
27
+ const int16_t* rhs_indices, int start_row, int end_row,
28
+ int block_height, int block_width, bool relu,
29
+ int replicas, int stride, float* output) {
30
+ int weight_index = 0;
31
+ int bias_index = 0;
32
+ std::vector<float> accumulators(block_height);
33
+ for (int row_block = start_row; row_block < end_row;
34
+ ++row_block, output += block_height) {
35
+ int nnz = nnz_per_row[row_block];
36
+ // Biases are now stored and used directly without pre-division.
37
+ for (int i = 0; i < block_height; ++i) accumulators[i] = bias[bias_index++];
38
+
39
+ for (int c = 0; c < nnz; ++c) {
40
+ int rhs_index = rhs_indices[c];
41
+ const float* block_rhs = rhs + rhs_index * block_width;
42
+ // Multiply this |block_height| x |block_width| block.
43
+ for (int i = 0; i < block_height; ++i) {
44
+ for (int j = 0; j < block_width; ++j) {
45
+ accumulators[i] += weights[weight_index++] * block_rhs[j];
46
+ }
47
+ }
48
+ }
49
+ rhs_indices += nnz;
50
+ // Apply relu if desired.
51
+ if (relu) {
52
+ for (int i = 0; i < block_height; ++i) {
53
+ if (accumulators[i] < 0) accumulators[i] = 0;
54
+ }
55
+ }
56
+ for (int r = 0; r < replicas; ++r) {
57
+ for (int i = 0; i < block_height; ++i) {
58
+ output[i + r * stride] = accumulators[i];
59
+ }
60
+ }
61
+ }
62
+ }
63
+
64
+ void MatVecFixedGeneric(const int16_t* weights, const int16_t* rhs,
65
+ const int32_t* bias, const int32_t* nnz_per_row,
66
+ const int16_t* rhs_indices, int start_row, int end_row,
67
+ int block_height, int block_width, bool relu,
68
+ int bytes_out, int shift_out, int replicas, int stride,
69
+ void* output) {
70
+ int weight_index = 0;
71
+ int bias_index = 0;
72
+ std::vector<int32_t> accumulators(block_height);
73
+ for (int row_block = start_row; row_block < end_row; ++row_block) {
74
+ int nnz = nnz_per_row[row_block];
75
+ // Biases are now stored and used directly without pre-division.
76
+ for (int i = 0; i < block_height; ++i) accumulators[i] = bias[bias_index++];
77
+
78
+ for (int c = 0; c < nnz; ++c) {
79
+ int rhs_index = rhs_indices[c];
80
+ const int16_t* block_rhs = rhs + rhs_index * block_width;
81
+ // Multiply this |block_height| x |block_width| block.
82
+ for (int i = 0; i < block_height; ++i) {
83
+ for (int j = 0; j < block_width; ++j) {
84
+ accumulators[i] += weights[weight_index++] * block_rhs[j];
85
+ }
86
+ }
87
+ }
88
+ rhs_indices += nnz;
89
+ // Apply relu if desired.
90
+ if (relu) {
91
+ for (int i = 0; i < block_height; ++i) {
92
+ if (accumulators[i] < 0) accumulators[i] = 0;
93
+ }
94
+ }
95
+ // Output shift.
96
+ if (shift_out > 0) {
97
+ for (int i = 0; i < block_height; ++i) {
98
+ accumulators[i] >>= shift_out;
99
+ }
100
+ }
101
+ if (bytes_out == 2) {
102
+ int16_t* out16 = reinterpret_cast<int16_t*>(output);
103
+ output = out16 + block_height;
104
+ for (int r = 0; r < replicas; ++r, out16 += stride) {
105
+ for (int i = 0; i < block_height; ++i) {
106
+ out16[i] = accumulators[i];
107
+ }
108
+ }
109
+ } else {
110
+ int32_t* out32 = reinterpret_cast<int32_t*>(output);
111
+ output = out32 + block_height;
112
+ for (int r = 0; r < replicas; ++r, out32 += stride) {
113
+ for (int i = 0; i < block_height; ++i) {
114
+ out32[i] = accumulators[i];
115
+ }
116
+ }
117
+ }
118
+ }
119
+ }
120
+
121
+ } // namespace detail
122
+ } // namespace csrblocksparse
sparse_matmul/compute/matmul_generic.h ADDED
@@ -0,0 +1,41 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
19
+
20
+ #include <cstdint>
21
+
22
+ namespace csrblocksparse {
23
+ namespace detail {
24
+
25
+ // Generic version uses plain C++ code.
26
+ void MatVecFloatGeneric(const float* weights, const float* rhs,
27
+ const float* bias, const int32_t* nnz_per_row,
28
+ const int16_t* rhs_indices, int start_row, int end_row,
29
+ int block_height, int block_width, bool relu,
30
+ int replicas, int stride, float* output);
31
+ void MatVecFixedGeneric(const int16_t* weights, const int16_t* rhs,
32
+ const int32_t* bias, const int32_t* nnz_per_row,
33
+ const int16_t* rhs_indices, int start_row, int end_row,
34
+ int block_height, int block_width, bool relu,
35
+ int bytes_out, int shift_out, int replicas, int stride,
36
+ void* output);
37
+
38
+ } // namespace detail
39
+ } // namespace csrblocksparse
40
+
41
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_MATMUL_GENERIC_H_
sparse_matmul/compute/thread_bounds.cc ADDED
@@ -0,0 +1,106 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/compute/thread_bounds.h"
16
+
17
+ #include <vector>
18
+
19
+ #include "glog/logging.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ void ThreadBounds::PrepareForThreads(int block_width, int block_height,
24
+ int num_threads,
25
+ int reduced_rows_per_cache_row,
26
+ int reduced_rows, const int* nnz_per_row) {
27
+ CHECK_GT(num_threads, 0);
28
+ block_width_ = block_width;
29
+ block_height_ = block_height;
30
+ ComputeThreadSplitPoints(num_threads, reduced_rows_per_cache_row,
31
+ reduced_rows, nnz_per_row);
32
+ weight_starts_.clear();
33
+ rhs_indices_starts_.clear();
34
+ bias_starts_.clear();
35
+ weight_starts_.reserve(row_starts_.size());
36
+ rhs_indices_starts_.reserve(row_starts_.size());
37
+ bias_starts_.reserve(row_starts_.size());
38
+
39
+ // Compute the start indices of each of the types, given what we know about
40
+ // padding, and number of |nnz_per_row|.
41
+ int weight_index = 0;
42
+ int rhs_indices_index = 0;
43
+ int bias_index = 0;
44
+ int row = 0;
45
+ for (int start : row_starts_) {
46
+ while (row < start) {
47
+ weight_index += nnz_per_row[row] * block_width_ * block_height_;
48
+ rhs_indices_index += nnz_per_row[row];
49
+ bias_index += block_height_;
50
+ ++row;
51
+ }
52
+ weight_starts_.push_back(weight_index);
53
+ rhs_indices_starts_.push_back(rhs_indices_index);
54
+ bias_starts_.push_back(bias_index);
55
+ }
56
+ }
57
+
58
+ // Computes the block row (reduced) index of the start of each thread.
59
+ void ThreadBounds::ComputeThreadSplitPoints(int num_threads,
60
+ int reduced_rows_per_cache_row,
61
+ int reduced_rows,
62
+ const int* nnz_per_row) {
63
+ row_starts_.assign(/*n=*/1, /*val=*/0);
64
+ // Break the rule if the matrix is too small to allow one per thread, which
65
+ // occurs only during tests.
66
+ if (reduced_rows_per_cache_row * num_threads > reduced_rows)
67
+ reduced_rows_per_cache_row = std::max(reduced_rows / num_threads, 1);
68
+ int cache_rows = (reduced_rows + reduced_rows_per_cache_row - 1) /
69
+ reduced_rows_per_cache_row;
70
+
71
+ // Compute exclusive prefix sum of the amount of work per row.
72
+ std::vector<int> work_upto_row(cache_rows + 1, 0);
73
+ int extra_row_work = 2 * reduced_rows_per_cache_row;
74
+ for (int i = 0; i < cache_rows; ++i) {
75
+ int new_nnz = 0;
76
+ for (int j = 0; j < reduced_rows_per_cache_row; ++j) {
77
+ // if |reduced_rows_per_cache_row| isn't an exact multiple of the
78
+ // matrix size, then we need to be careful here.
79
+ int index = i * reduced_rows_per_cache_row + j;
80
+ if (index < reduced_rows) new_nnz += nnz_per_row[index];
81
+ }
82
+ work_upto_row[i + 1] = new_nnz + extra_row_work + work_upto_row[i];
83
+ }
84
+ int total_work = work_upto_row.back();
85
+ // Find the split points by assigning an approximately equal amount
86
+ // of work for each thread.
87
+ int prev_split = 0;
88
+ for (int i = 1; i <= num_threads; ++i) {
89
+ int split = std::distance(
90
+ work_upto_row.begin(),
91
+ std::lower_bound(work_upto_row.begin(), work_upto_row.end(),
92
+ i * total_work / num_threads));
93
+ int split_row = split * reduced_rows_per_cache_row;
94
+ if (i == num_threads) {
95
+ split_row = reduced_rows;
96
+ }
97
+
98
+ VLOG(2) << "tid=" << i - 1 << " num rows=" << split_row - row_starts_.back()
99
+ << " work=" << work_upto_row[split] - work_upto_row[prev_split];
100
+ row_starts_.push_back(split_row);
101
+ prev_split = split;
102
+ }
103
+ VLOG(2) << "total rows=" << reduced_rows << " total work=" << total_work;
104
+ }
105
+
106
+ } // namespace csrblocksparse
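ComputeThreadSplitPoints above balances threads by building an exclusive prefix sum of per-row work (nnz plus a fixed per-row overhead) and binary-searching it for each thread's share. A standalone sketch of the same idea, simplified to one block-row per cache row and with made-up nnz counts:

// Standalone sketch of the load-balancing idea in ComputeThreadSplitPoints.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> nnz_per_row = {8, 1, 1, 1, 9, 2, 2, 8};
  const int extra_row_work = 2;  // fixed per-row cost, as in the code above.
  const int num_threads = 2;

  // Exclusive prefix sum of the work per row.
  std::vector<int> work_upto_row(nnz_per_row.size() + 1, 0);
  for (size_t i = 0; i < nnz_per_row.size(); ++i)
    work_upto_row[i + 1] = work_upto_row[i] + nnz_per_row[i] + extra_row_work;
  int total_work = work_upto_row.back();

  // Each thread gets the rows up to roughly t/num_threads of the total work.
  std::vector<int> row_starts = {0};
  for (int t = 1; t <= num_threads; ++t) {
    auto it = std::lower_bound(work_upto_row.begin(), work_upto_row.end(),
                               t * total_work / num_threads);
    int split = static_cast<int>(std::distance(work_upto_row.begin(), it));
    if (t == num_threads) split = static_cast<int>(nnz_per_row.size());
    row_starts.push_back(split);
  }
  for (size_t t = 0; t + 1 < row_starts.size(); ++t)
    std::printf("thread %zu: rows [%d, %d)\n", t, row_starts[t], row_starts[t + 1]);
  return 0;
}

For these numbers the split lands at row 5, giving the two threads roughly equal work despite the uneven nnz counts.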
sparse_matmul/compute/thread_bounds.h ADDED
@@ -0,0 +1,74 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
19
+
20
+ #include <vector>
21
+
22
+ namespace csrblocksparse {
23
+
24
+ // Class to compute and store the bounds of each thread used in a computation,
25
+ // and to provide corresponding spans of vectors.
26
+ class ThreadBounds {
27
+ public:
28
+ ThreadBounds() : block_width_(0), block_height_(0) {}
29
+
30
+ void PrepareForThreads(int block_width, int block_height, int num_threads,
31
+ int reduced_rows_per_cache_row, int reduced_rows,
32
+ const int* nnz_per_row);
33
+
34
+ // Functions that offset the appropriate type to the start of the data
35
+ // needed by the given thread id (|tid|).
36
+ template <typename WeightType>
37
+ const WeightType* OffsetWeights(const WeightType* weights, int tid) const {
38
+ return weights + weight_starts_[tid];
39
+ }
40
+ template <typename RhsIndType>
41
+ const RhsIndType* OffsetRhsIndices(const RhsIndType* rhs_indices,
42
+ int tid) const {
43
+ return rhs_indices + rhs_indices_starts_[tid];
44
+ }
45
+ template <typename BiasType>
46
+ const BiasType* OffsetBias(const BiasType* bias, int tid) const {
47
+ return bias + bias_starts_[tid];
48
+ }
49
+ template <typename OutType>
50
+ OutType* OffsetOutput(OutType* output, int tid) const {
51
+ return output + block_height_ * row_starts_[tid];
52
+ }
53
+ int StartRow(int tid) const { return row_starts_[tid]; }
54
+ const std::vector<int>& row_starts() const { return row_starts_; }
55
+
56
+ private:
57
+ // Computes the block row (reduced) index of the start of each thread.
58
+ void ComputeThreadSplitPoints(int num_threads, int reduced_rows_per_cache_row,
59
+ int reduced_rows, const int* nnz_per_row);
60
+
61
+ // Sizes of a sparse block.
62
+ int block_width_;
63
+ int block_height_;
64
+ // Start indices of each data type by thread-id with an extra value at the
65
+ // end.
66
+ std::vector<int> row_starts_;
67
+ std::vector<int> weight_starts_;
68
+ std::vector<int> rhs_indices_starts_;
69
+ std::vector<int> bias_starts_;
70
+ };
71
+
72
+ } // namespace csrblocksparse
73
+
74
+ #endif // LYRA_CODEC_SPARSE_MATMUL_COMPUTE_THREAD_BOUNDS_H_
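A usage sketch for ThreadBounds as declared above; it only builds inside this source tree (it needs the header plus glog), and the nnz values are invented for illustration:

// Hedged usage sketch: split 8 block-rows of 4x4 blocks across 2 threads,
// then offset per-thread pointers the way a kernel caller would.
#include <cstdio>
#include <vector>
#include "sparse_matmul/compute/thread_bounds.h"

int main() {
  std::vector<int> nnz_per_row = {4, 4, 2, 2, 6, 6, 1, 1};  // per block-row.
  csrblocksparse::ThreadBounds bounds;
  bounds.PrepareForThreads(/*block_width=*/4, /*block_height=*/4,
                           /*num_threads=*/2,
                           /*reduced_rows_per_cache_row=*/1,
                           /*reduced_rows=*/8, nnz_per_row.data());
  // 26 blocks of 16 weights in total; outputs are 4 per block-row.
  std::vector<float> weights(26 * 16), output(8 * 4);
  for (int tid = 0; tid < 2; ++tid) {
    const float* w = bounds.OffsetWeights(weights.data(), tid);
    float* out = bounds.OffsetOutput(output.data(), tid);
    std::printf("tid %d starts at block-row %d (weights offset %ld)\n", tid,
                bounds.StartRow(tid), static_cast<long>(w - weights.data()));
    (void)out;
  }
  return 0;
}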
sparse_matmul/layers/BUILD ADDED
@@ -0,0 +1,146 @@
1
+ # Sparse/Masked Matrix and Layer.
2
+
3
+ # [internal] load android_library_selector
4
+ # [internal] load android_cc_test:def.bzl
5
+
6
+ licenses(["notice"])
7
+
8
+ cc_library(
9
+ name = "layer",
10
+ hdrs = [
11
+ "sparse_linear_layer.h",
12
+ ],
13
+ visibility = [
14
+ "//sparse_matmul:__subpackages__",
15
+ ],
16
+ deps = [
17
+ ":matrix",
18
+ "//sparse_matmul/numerics:types",
19
+ "//sparse_matmul/os:coop_threads",
20
+ "//sparse_matmul/vector:cache_aligned_vector",
21
+ "@com_google_absl//absl/memory",
22
+ "@com_google_absl//absl/strings:str_format",
23
+ "@com_google_glog//:glog",
24
+ ],
25
+ )
26
+
27
+ cc_library(
28
+ name = "matrix",
29
+ hdrs = [
30
+ "csr_blocksparse_matrix.h",
31
+ "masked_sparse_matrix.h",
32
+ ],
33
+ visibility = [
34
+ "//sparse_matmul:__subpackages__",
35
+ ],
36
+ deps = [
37
+ "//sparse_matmul/compute:kernels",
38
+ "//sparse_matmul/compute:matmul",
39
+ "//sparse_matmul/compute:thread_bounds",
40
+ "//sparse_matmul/numerics:types",
41
+ "//sparse_matmul/os:coop_threads",
42
+ "//sparse_matmul/vector:cache_aligned_vector",
43
+ "@com_google_absl//absl/memory",
44
+ "@com_google_absl//absl/strings:str_format",
45
+ "@com_google_glog//:glog",
46
+ ],
47
+ )
48
+
49
+ cc_library(
50
+ name = "utils",
51
+ srcs = [
52
+ "utils.cc",
53
+ ],
54
+ hdrs = [
55
+ "read_array_ifstream.h",
56
+ "utils.h",
57
+ ],
58
+ visibility = [
59
+ "//sparse_matmul:__subpackages__",
60
+ ],
61
+ deps = [
62
+ ":layer",
63
+ ":matrix",
64
+ ":status",
65
+ "//sparse_matmul/numerics:types",
66
+ "//sparse_matmul/vector:cache_aligned_vector",
67
+ "//sparse_matmul/zlib_wrapper",
68
+ "@com_google_absl//absl/status",
69
+ "@com_google_absl//absl/strings",
70
+ "@com_google_absl//absl/strings:cord",
71
+ "@gulrak_filesystem//:filesystem",
72
+ ],
73
+ )
74
+
75
+ cc_library(
76
+ name = "status",
77
+ srcs = [
78
+ "errno_mapping.cc",
79
+ ],
80
+ hdrs = [
81
+ "errno_mapping.h",
82
+ "status_macros.h",
83
+ ],
84
+ deps = [
85
+ "@com_google_absl//absl/status",
86
+ "@com_google_absl//absl/status:statusor",
87
+ "@com_google_absl//absl/strings",
88
+ "@com_google_absl//absl/strings:cord",
89
+ ],
90
+ )
91
+
92
+ cc_test(
93
+ name = "csrblocksparse_test",
94
+ size = "small",
95
+ srcs = [
96
+ "csrblocksparse_test.cc",
97
+ ],
98
+ data = glob(["testdata/*"]),
99
+ linkopts = select({
100
+ "@bazel_tools//platforms:android": ["-landroid"],
101
+ "//conditions:default": [],
102
+ }),
103
+ shard_count = 10,
104
+ deps = [
105
+ ":status",
106
+ ":utils",
107
+ "//sparse_matmul/compute:matmul",
108
+ "//sparse_matmul/numerics:test_utils",
109
+ "//sparse_matmul/os:coop_threads",
110
+ "@com_google_absl//absl/status",
111
+ "@com_google_absl//absl/strings",
112
+ "@com_google_absl//absl/types:span",
113
+ "@com_google_googletest//:gtest_main",
114
+ "@gulrak_filesystem//:filesystem",
115
+ ],
116
+ )
117
+
118
+ cc_test(
119
+ name = "sparse_linear_layer_test",
120
+ srcs = [
121
+ "sparse_linear_layer_test.cc",
122
+ ],
123
+ deps = [
124
+ ":layer",
125
+ "//sparse_matmul/numerics:test_utils",
126
+ "@com_google_googletest//:gtest_main",
127
+ ],
128
+ )
129
+
130
+ cc_test(
131
+ name = "utils_test",
132
+ srcs = ["utils_test.cc"],
133
+ deps = [
134
+ ":layer",
135
+ ":matrix",
136
+ ":status",
137
+ ":utils",
138
+ "//sparse_matmul/numerics:fast_transcendentals",
139
+ "//sparse_matmul/numerics:test_utils",
140
+ "//sparse_matmul/numerics:types",
141
+ "//sparse_matmul/vector:cache_aligned_vector",
142
+ "@com_google_absl//absl/flags:flag",
143
+ "@com_google_googletest//:gtest_main",
144
+ "@gulrak_filesystem//:filesystem",
145
+ ],
146
+ )
sparse_matmul/layers/csr_blocksparse_matrix.h ADDED
@@ -0,0 +1,835 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
19
+
20
+ #include <algorithm>
21
+ #include <cstdint>
22
+ #include <iostream>
23
+ #include <memory>
24
+ #include <tuple>
25
+ #include <vector>
26
+
27
+ #include "glog/logging.h"
28
+ // IWYU pragma: begin_exports
29
+ #include "sparse_matmul/compute/kernels_generic.h"
30
+ #include "sparse_matmul/compute/matmul.h"
31
+ #include "sparse_matmul/compute/thread_bounds.h"
32
+ #include "sparse_matmul/layers/masked_sparse_matrix.h"
33
+ #include "sparse_matmul/numerics/fixed_types.h"
34
+ #include "sparse_matmul/numerics/float16_types.h"
35
+ #include "sparse_matmul/os/coop_threads.h"
36
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
37
+ // IWYU pragma: end_exports
38
+ #include "absl/memory/memory.h"
39
+
40
+ namespace csrblocksparse {
41
+ // CsrBlockSparseMatrix stores a modified block compressed sparse row
42
+ // representation of a sparse matrix. The ordering of the weights is modified
43
+ // in the 16x1 and 1x1 cases so that a certain number (4 and 8 respectively)
44
+ // of columns of weights are stored contiguously before moving on to the next
45
+ // row. The 4x4 case stores each block contiguously.
46
+ //
47
+ // Currently it is constructed from a MaskedSparseMatrix which uses a dense
48
+ // binary mask representation. The construction generates the compressed
49
+ // representation. Further iterations will support a direct serialization
50
+ // of the compressed representation.
51
+ //
52
+ // MaskedSparseMatrix masked_matrix(rows, cols, existing_mask, existing_values)
53
+ // CsrBlockSparseMatrix matrix(masked_matrix)
54
+ //
55
+ // matrix.SpMV_bias(rhs, bias, &out);
56
+ //
57
+ // This class is thread compatible.
58
+ template <typename WeightType, typename RhsType, typename DeltaType = int16_t>
59
+ class CsrBlockSparseMatrix {
60
+ public:
61
+ CsrBlockSparseMatrix() {}
62
+
63
+ // Reference used to indicate that this is an input and not an output.
64
+ CsrBlockSparseMatrix(const uint8_t* const& buffer, const std::size_t& len) {
65
+ ReadFromFlatBuffer(buffer, len);
66
+ ComputeRHSIndices();
67
+ }
68
+
69
+ template <typename InputType>
70
+ CsrBlockSparseMatrix(const MaskedSparseMatrix<InputType>& masked_matrix) {
71
+ sparsity_ = masked_matrix.sparsity();
72
+ rows_ = masked_matrix.rows();
73
+ cols_ = masked_matrix.cols();
74
+
75
+ DetermineBlockSize(masked_matrix);
76
+
77
+ if (block_width_ == 1 && block_height_ == 1)
78
+ col_multiple_ = 8;
79
+ else
80
+ col_multiple_ = 1;
81
+
82
+ std::vector<InputType> weights(masked_matrix.values().begin(),
83
+ masked_matrix.values().end());
84
+
85
+ reduced_rows_ = (rows_ + block_height_ - 1) / block_height_;
86
+ rows_ = reduced_rows_ * block_height_;
87
+ reduced_cols_ = cols_ / block_width_;
88
+
89
+ // Calculate the reduced CSR representation of the matrix.
90
+ std::vector<int> reduced_mask(reduced_rows_ * reduced_cols_);
91
+ std::vector<int> row_offsets = {0};
92
+ int nnz = 0;
93
+ const auto& mask = masked_matrix.mask();
94
+ for (int r = 0; r < reduced_rows_; ++r) {
95
+ for (int c = 0; c < reduced_cols_; ++c) {
96
+ int mask_val = mask[r * block_height_ * cols_ + c * block_width_];
97
+ reduced_mask[r * reduced_cols_ + c] = mask_val;
98
+ nnz += mask_val;
99
+ }
100
+ row_offsets.push_back(nnz);
101
+ }
102
+
103
+ // Make sure the reduced representation has the correct number of columns.
104
+ MakeColumnsMultiple(row_offsets, &reduced_mask, &weights);
105
+
106
+ std::vector<int> col_indices;
107
+ std::vector<WeightType> weights_csr;
108
+ std::vector<int> nnz_per_row;
109
+ MaskAndWeightsToCsr(reduced_mask, weights, &nnz_per_row, &col_indices,
110
+ &weights_csr);
111
+
112
+ // Generate column deltas from |col_indices|.
113
+ std::vector<DeltaType> col_deltas;
114
+ for (int i = 0; i < col_indices.size(); ++i) {
115
+ // |col_indices| are used to index the RHS vector which is always float.
116
+ int64_t diff = sizeof(RhsType);
117
+ if (i == 0)
118
+ diff *= block_width_ * (col_indices[i]);
119
+ else
120
+ diff *= block_width_ * (col_indices[i] - col_indices[i - 1]);
121
+
122
+ CHECK(diff < std::numeric_limits<DeltaType>::max())
123
+ << "delta between column indices in bytes " << diff
124
+ << " exceeded the maximum size of the DeltaType "
125
+ << std::numeric_limits<DeltaType>::max();
126
+ col_deltas.push_back(static_cast<DeltaType>(diff));
127
+ }
128
+
129
+ // Because of pre-fetching we need some extra values at the end.
130
+ col_deltas.insert(col_deltas.end(), std::max(2, col_multiple_ + 1), 0);
131
+ nnz_per_row.insert(nnz_per_row.end(), 2, nnz_per_row.back());
132
+
133
+ weights_ = CacheAlignedVector<WeightType>(weights_csr);
134
+ col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
135
+ nnz_per_row_ = CacheAlignedVector<int>(nnz_per_row);
136
+ ComputeRHSIndices();
137
+
138
+ num_threads_ = 0;
139
+ PrepareForThreads(1);
140
+ }
141
+
142
+ // Constructor makes a matrix from the given weights, deltas and nnz, taking
143
+ // the other parameters from |src_matrix|. |cols| is the number of raw columns
144
+ // (NOT blocks) of the new matrix.
145
+ CsrBlockSparseMatrix(
146
+ const CsrBlockSparseMatrix<WeightType, RhsType, DeltaType>& src_matrix,
147
+ const std::vector<WeightType>& new_weights,
148
+ const std::vector<DeltaType>& new_deltas, const std::vector<int>& new_nnz,
149
+ int cols) {
150
+ num_threads_ = 0;
151
+ col_multiple_ = src_matrix.col_multiple_;
152
+ block_width_ = src_matrix.block_width_;
153
+ block_height_ = src_matrix.block_height_;
154
+ reduced_rows_ = new_nnz.size();
155
+ rows_ = reduced_rows_ * block_height_;
156
+ cols_ = cols;
157
+ reduced_cols_ = cols_ / block_width_;
158
+ weights_ = CacheAlignedVector<WeightType>(new_weights);
159
+ col_deltas_ = CacheAlignedVector<DeltaType>(new_deltas);
160
+ nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
161
+ sparsity_ = 1.0f - static_cast<float>(new_weights.size()) / (rows_ * cols_);
162
+ ComputeRHSIndices();
163
+ name_ = src_matrix.name_;
164
+ PrepareForThreads(1);
165
+ }
166
+
167
+ // Factory method takes a column slice out of *this and returns a sparse
168
+ // matrix that takes as inputs [|start_col|, |end_col|) of *this, and
169
+ // returns the same number of outputs, but only a partial result.
170
+ // If |keep_rhs_size|, then the new matrix takes the same rhs as the current
171
+ // matrix, but uses a subset of it, instead of expecting just the reduced rhs.
172
+ // If |start_col| > |end_col|, then we slice out the complement of the defined
173
+ // interval, ie [0, |end_col|) + [|start_col|, current end).
174
+ // NOTE that |start_col| and |end_col| are in raw column coordinates, NOT
175
+ // block units.
176
+ CsrBlockSparseMatrix SplitByColumn(int start_col, int end_col,
177
+ bool keep_rhs_size = false) const {
178
+ int weight_index = 0;
179
+ int delta_index = 0;
180
+ std::vector<DeltaType> new_deltas;
181
+ std::vector<WeightType> new_weights;
182
+ std::vector<int> new_nnz(reduced_rows_);
183
+ int col = 0;
184
+ int prev_col = keep_rhs_size ? 0 : start_col;
185
+ for (int r = 0; r < reduced_rows_; ++r) {
186
+ int reduced_col_count = nnz_per_row_[r];
187
+ for (int c = 0; c < reduced_col_count; ++c, ++delta_index) {
188
+ col += col_deltas_[delta_index] / sizeof(RhsType);
189
+ if ((start_col < end_col && start_col <= col && col < end_col) ||
190
+ (start_col > end_col && (col < end_col || col >= start_col))) {
191
+ ++new_nnz[r];
192
+ new_deltas.push_back((col - prev_col) * sizeof(RhsType));
193
+ prev_col = col;
194
+ for (int i = 0; i < block_width_ * block_height_;
195
+ ++i, ++weight_index) {
196
+ new_weights.push_back(weights_[weight_index]);
197
+ }
198
+ } else {
199
+ weight_index += block_width_ * block_height_;
200
+ }
201
+ }
202
+ }
203
+ int new_cols = keep_rhs_size ? cols_ : end_col - start_col;
204
+ return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz,
205
+ new_cols);
206
+ }
207
+
208
+ // Factory method takes a row slice out of *this and returns a sparse
209
+ // matrix that takes the same inputs as *this, and returns the outputs for
210
+ // the range [|start_row|, |end_row|).
211
+ // NOTE that |start_row| and |end_row| are in raw row coordinates, NOT
212
+ // block units.
213
+ CsrBlockSparseMatrix SplitByRow(int start_row, int end_row) const {
214
+ int start_reduced = start_row / block_height_;
215
+ int end_reduced = end_row / block_height_;
216
+ std::vector<int> new_nnz(nnz_per_row_.data() + start_reduced,
217
+ nnz_per_row_.data() + end_reduced);
218
+ int weight_start = 0;
219
+ for (int r = 0; r < start_reduced; ++r) {
220
+ weight_start += nnz_per_row_[r];
221
+ }
222
+ int weight_end = weight_start;
223
+ for (int r = start_reduced; r < end_reduced; ++r) {
224
+ weight_end += nnz_per_row_[r];
225
+ }
226
+ int delta_start = 0;
227
+ for (int i = 0; i < weight_start; ++i) {
228
+ delta_start += col_deltas_[i];
229
+ }
230
+ std::vector<DeltaType> new_deltas(col_deltas_.data() + weight_start,
231
+ col_deltas_.data() + weight_end);
232
+ new_deltas[0] += delta_start;
233
+ int block_size = block_height_ * block_width_;
234
+ std::vector<WeightType> new_weights(
235
+ weights_.data() + weight_start * block_size,
236
+ weights_.data() + weight_end * block_size);
237
+ return CsrBlockSparseMatrix(*this, new_weights, new_deltas, new_nnz, cols_);
238
+ }
239
+
240
+ // Combines adjacent row blocks, doubling the block height.
241
+ // This necessarily involves adding zero weights where the blocks don't align
242
+ // across adjacent pairs of rows, so use with caution, as the resulting matrix
243
+ // is most likely to run slower if very sparse to begin with.
244
+ // In the few cases where the blocks do mostly align, the resulting matmul
245
+ // could be much faster, as the number of reads of the rhs will be halved.
246
+ void DoubleBlockHeight() {
247
+ int new_rows = reduced_rows_ / 2;
248
+ std::vector<int> new_nnz(new_rows);
249
+ std::vector<DeltaType> new_rhs_indices;
250
+ std::vector<WeightType> new_weights;
251
+ int rhs_index1 = 0;
252
+ int rhs_index2 = 0;
253
+ int block_size = block_height_ * block_width_;
254
+ for (int r = 0; r < new_rows; ++r) {
255
+ int start_nnz = new_rhs_indices.size();
256
+ rhs_index2 += nnz_per_row_[r * 2];
257
+ int end1 = rhs_index1 + nnz_per_row_[r * 2];
258
+ int end2 = rhs_index2 + nnz_per_row_[r * 2 + 1];
259
+ // Run over a pair of rows with 2 iterators, combining blocks as we go, or
260
+ // padding with zeros where the block positions don't match.
261
+ while (rhs_index1 < end1 || rhs_index2 < end2) {
262
+ int col1 = rhs_index1 < end1 ? rhs_indices_[rhs_index1] : reduced_cols_;
263
+ int col2 = rhs_index2 < end2 ? rhs_indices_[rhs_index2] : reduced_cols_;
264
+ if (col1 < col2) {
265
+ // Need zero weights for row2 to pad out weights block.
266
+ new_rhs_indices.push_back(col1);
267
+ new_weights.insert(new_weights.end(),
268
+ weights_.data() + rhs_index1 * block_size,
269
+ weights_.data() + (rhs_index1 + 1) * block_size);
270
+ new_weights.insert(new_weights.end(), block_size,
271
+ static_cast<WeightType>(0.0f));
272
+ ++rhs_index1;
273
+ } else if (col1 > col2) {
274
+ // Need zero weights for row1 to pad out weights block.
275
+ new_rhs_indices.push_back(col2);
276
+ new_weights.insert(new_weights.end(), block_size,
277
+ static_cast<WeightType>(0.0f));
278
+ new_weights.insert(new_weights.end(),
279
+ weights_.data() + rhs_index2 * block_size,
280
+ weights_.data() + (rhs_index2 + 1) * block_size);
281
+ ++rhs_index2;
282
+ } else {
283
+ // Combine weights for both row1 and row2.
284
+ new_rhs_indices.push_back(col1);
285
+ new_weights.insert(new_weights.end(),
286
+ weights_.data() + rhs_index1 * block_size,
287
+ weights_.data() + (rhs_index1 + 1) * block_size);
288
+ new_weights.insert(new_weights.end(),
289
+ weights_.data() + rhs_index2 * block_size,
290
+ weights_.data() + (rhs_index2 + 1) * block_size);
291
+ ++rhs_index1;
292
+ ++rhs_index2;
293
+ }
294
+ }
295
+ rhs_index1 = rhs_index2;
296
+ new_nnz[r] = new_rhs_indices.size() - start_nnz;
297
+ }
298
+ block_height_ *= 2;
299
+ reduced_rows_ /= 2;
300
+ weights_ = CacheAlignedVector<WeightType>(new_weights);
301
+ rhs_indices_ = CacheAlignedVector<DeltaType>(new_rhs_indices);
302
+ nnz_per_row_ = CacheAlignedVector<int>(new_nnz);
303
+ sparsity_ = 1.0f - static_cast<float>(new_weights.size()) / (rows_ * cols_);
304
+ ComputeColDeltas();
305
+ if (num_threads_ > 0) {
306
+ int num_threads = num_threads_;
307
+ num_threads_ = 0;
308
+ PrepareForThreads(num_threads);
309
+ }
310
+ }
311
+
312
+ // Serializes the matrix into |csr_flatbuffer| and returns the number of
313
+ // bytes written; the internal scratch buffer is freed before returning.
314
+ // TODO(b/189958858): Both Read and Write need to eventually handle the
315
+ // different possible HalfType and DeltaType values, but punting for now as
316
+ // there is only one supported combination.
317
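+ //
+ // Serialization round-trip sketch, mirroring the FlatBufferSerialization unit
+ // test (|matrix| is a placeholder for an already-constructed instance):
+ //
+ //   std::string buffer;
+ //   std::size_t num_bytes = matrix.WriteToFlatBuffer(&buffer);
+ //   CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t> restored(
+ //       reinterpret_cast<const uint8_t*>(buffer.c_str()), num_bytes);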
+ std::size_t WriteToFlatBuffer(std::string* csr_flatbuffer) {
318
+ std::size_t bytes = 0;
319
+ bytes += FixedParameterSize();
320
+ bytes += weights_.size() * sizeof(WeightType);
321
+ bytes += col_deltas_.size() * sizeof(DeltaType);
322
+ bytes += nnz_per_row_.size() * sizeof(int);
323
+
324
+ uint8_t* bytes_ptr_ptr =
325
+ reinterpret_cast<uint8_t*>(CHECK_NOTNULL(malloc(bytes)));
326
+
327
+ int* int_bytes_ptr = reinterpret_cast<int*>(bytes_ptr_ptr);
328
+
329
+ *int_bytes_ptr++ = rows_;
330
+ *int_bytes_ptr++ = cols_;
331
+ *int_bytes_ptr++ = reduced_rows_;
332
+ *int_bytes_ptr++ = reduced_cols_;
333
+ *int_bytes_ptr++ = block_width_;
334
+ *int_bytes_ptr++ = block_height_;
335
+ *int_bytes_ptr++ = col_multiple_;
336
+ *int_bytes_ptr++ = num_threads_;
337
+ *int_bytes_ptr++ = weights_.size();
338
+ *int_bytes_ptr++ = col_deltas_.size();
339
+ *int_bytes_ptr++ = nnz_per_row_.size();
340
+
341
+ float* float_bytes_ptr = reinterpret_cast<float*>(int_bytes_ptr);
342
+ *float_bytes_ptr++ = sparsity_;
343
+
344
+ uint8_t* bytes_ptr = reinterpret_cast<uint8_t*>(float_bytes_ptr);
345
+
346
+ memcpy(bytes_ptr, weights_.data(), weights_.size() * sizeof(WeightType));
347
+ bytes_ptr += weights_.size() * sizeof(WeightType);
348
+
349
+ memcpy(bytes_ptr, col_deltas_.data(),
350
+ col_deltas_.size() * sizeof(DeltaType));
351
+ bytes_ptr += col_deltas_.size() * sizeof(DeltaType);
352
+
353
+ memcpy(bytes_ptr, nnz_per_row_.data(), nnz_per_row_.size() * sizeof(int));
354
+ bytes_ptr += nnz_per_row_.size() * sizeof(int);
355
+
356
+ csr_flatbuffer->resize(bytes);
357
+ csr_flatbuffer->assign(reinterpret_cast<char*>(bytes_ptr_ptr), bytes);
358
+ free(bytes_ptr_ptr);
359
+
360
+ return bytes;
361
+ }
362
+
363
+ void ReadFromFlatBuffer(const uint8_t* const& bytes, const std::size_t& len) {
364
+ CHECK_GE(len, FixedParameterSize());
365
+
366
+ const int* int_bytes_ptr = reinterpret_cast<const int*>(bytes);
367
+ rows_ = *int_bytes_ptr++;
368
+ cols_ = *int_bytes_ptr++;
369
+ reduced_rows_ = *int_bytes_ptr++;
370
+ reduced_cols_ = *int_bytes_ptr++;
371
+ block_width_ = *int_bytes_ptr++;
372
+ block_height_ = *int_bytes_ptr++;
373
+ col_multiple_ = *int_bytes_ptr++;
374
+ int num_threads = *int_bytes_ptr++;
375
+ int32_t weights_size = *int_bytes_ptr++;
376
+ int32_t col_deltas_size = *int_bytes_ptr++;
377
+ int32_t nnz_per_row_size = *int_bytes_ptr++;
378
+
379
+ // Make sure negative sizes don't mess things up.
380
+ weights_size = std::max(0, weights_size);
381
+ col_deltas_size = std::max(0, col_deltas_size);
382
+ nnz_per_row_size = std::max(0, nnz_per_row_size);
383
+
384
+ const float* float_bytes_ptr =
385
+ reinterpret_cast<const float*>(int_bytes_ptr);
386
+ sparsity_ = *float_bytes_ptr++;
387
+
388
+ std::size_t total_bytes =
389
+ FixedParameterSize() + weights_size * sizeof(WeightType) +
390
+ col_deltas_size * sizeof(DeltaType) + nnz_per_row_size * sizeof(int);
391
+
392
+ CHECK_EQ(total_bytes, len)
393
+ << "total bytes: " << total_bytes << ", actual len given: " << len;
394
+
395
+ const uint8_t* bytes_ptr =
396
+ reinterpret_cast<const uint8_t*>(float_bytes_ptr);
397
+ std::vector<WeightType> weights_raw(weights_size);
398
+ memcpy(weights_raw.data(), bytes_ptr, weights_size * sizeof(WeightType));
399
+ weights_ = CacheAlignedVector<WeightType>(weights_raw);
400
+ bytes_ptr += weights_size * sizeof(WeightType);
401
+
402
+ std::vector<DeltaType> deltas_raw(col_deltas_size);
403
+ memcpy(deltas_raw.data(), bytes_ptr, col_deltas_size * sizeof(DeltaType));
404
+ col_deltas_ = CacheAlignedVector<DeltaType>(deltas_raw);
405
+ bytes_ptr += col_deltas_size * sizeof(DeltaType);
406
+
407
+ std::vector<int> nnz_raw(nnz_per_row_size);
408
+ memcpy(nnz_raw.data(), bytes_ptr, nnz_per_row_size * sizeof(int));
409
+ nnz_per_row_ = CacheAlignedVector<int>(nnz_raw);
410
+ num_threads_ = 0;
411
+ PrepareForThreads(num_threads);
412
+ }
413
+
414
+ // Multiplies a sparse matrix by a possibly dense matrix. Often the right-hand
415
+ // side has only a small number of columns, hence the term "fat vector".
416
+ // 1x1 and 4x4 have specializations for output columns (i.e. fatness) >= 5,
417
+ // and often achieve twice as many GFlops when multiplying a right hand side
418
+ // that has 5 or more columns. (Best is a multiple of 5).
419
+ // 16x1 doesn't have enough registers and just loops over the width 1 kernel.
420
+ //
421
+ // |rhs| and |out| are COLUMN MAJOR.
422
+
423
+ // Fast Tuples WeightType, BiasType, RhsType, OutType are:
424
+ // (float, float, float, float)
425
+ // (bfloat16, float, float, float)
426
+ // and only on ARM64. All other cases use a slow generic implementation.
427
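+ //
+ // Fat-vector sketch, mirroring the unit tests (|matrix|, |rows| and |cols|
+ // are placeholders; a width of 5 or more reaches the SpMM5 kernels):
+ //
+ //   constexpr int kFatness = 5;
+ //   FatCacheAlignedVector<float> fat_rhs(cols, kFatness);
+ //   FatCacheAlignedVector<float> fat_out(rows, kFatness);
+ //   CacheAlignedVector<float> bias(rows);
+ //   fat_rhs.FillRandom();
+ //   bias.FillRandom();
+ //   fat_out.FillZero();
+ //   matrix.SpMM_bias(fat_rhs, bias, &fat_out, /*relu=*/true);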
+ template <typename RhsClass, typename BiasClass, typename OutClass,
428
+ typename BiasType = typename BiasClass::value_type,
429
+ typename OutType = typename OutClass::value_type>
430
+ void SpMM_bias(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
431
+ bool relu = false, int tid = 0,
432
+ SpinBarrier* barrier = nullptr) const {
433
+ static_assert(std::is_same<typename RhsClass::value_type, RhsType>::value,
434
+ "Rhs types must match");
435
+ CHECK_LT(tid, num_threads_);
436
+ CHECK_EQ(rhs.cols(), out->cols());
437
+ CHECK_EQ(rhs.rows(), cols_);
438
+ CHECK_GE(out->rows(), rows_);
439
+ int cols_to_go = out->cols();
440
+ int rhs_index = *thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid);
441
+ const RhsType* rhs_ptr = rhs.data() + rhs_index * block_height_;
442
+ OutType* out_ptr = thread_bounds_.OffsetOutput(out->data(), tid);
443
+ const WeightType* weights_ptr =
444
+ thread_bounds_.OffsetWeights(weights_.data(), tid);
445
+ const DeltaType* delta_ptr =
446
+ thread_bounds_.OffsetRhsIndices(col_deltas_.data(), tid);
447
+ int offset = *delta_ptr / sizeof(RhsType);
448
+ rhs_ptr -= offset;
449
+ const int* nnz_ptr = nnz_per_row_.data() + thread_bounds_.StartRow(tid);
450
+ int assigned_rows =
451
+ thread_bounds_.StartRow(tid + 1) - thread_bounds_.StartRow(tid);
452
+ const BiasType* bias_ptr = thread_bounds_.OffsetBias(bias.data(), tid);
453
+
454
+ while (cols_to_go > 0) {
455
+ if (block_width_ == 4 && block_height_ == 4) {
456
+ if (cols_to_go >= 5) {
457
+ detail::SpMM5_4x4<WeightType, RhsType, OutType>(
458
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
459
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
460
+ } else {
461
+ detail::SpMV_4x4<WeightType, RhsType, OutType>(
462
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
463
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
464
+ }
465
+ } else {
466
+ if (cols_to_go >= 5) {
467
+ detail::SpMM5_1x1<WeightType, RhsType, OutType>(
468
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
469
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
470
+ } else {
471
+ detail::SpMV_1x1<WeightType, RhsType, OutType>(
472
+ weights_ptr, delta_ptr, nnz_ptr, rhs_ptr, bias_ptr, out_ptr,
473
+ assigned_rows, out->col_stride(), rhs.col_stride(), relu);
474
+ }
475
+ }
476
+
477
+ if (cols_to_go >= 5) {
478
+ cols_to_go -= 5;
479
+ rhs_ptr += rhs.col_stride() * 5;
480
+ out_ptr += out->col_stride() * 5;
481
+ } else {
482
+ cols_to_go--;
483
+ rhs_ptr += rhs.col_stride();
484
+ out_ptr += out->col_stride();
485
+ }
486
+ if (barrier) barrier->barrier();
487
+ }
488
+ }
489
+ template <typename MVRhsType, typename MVBiasType, typename OutType>
490
+ void MatVec(const MVRhsType* rhs, const MVBiasType* bias, bool relu, int tid,
491
+ int replicas, int output_stride, OutType* output) {
492
+ CHECK_LT(tid, num_threads_);
493
+ CHECK_EQ(block_width_, 4) << "Block width must be 4!";
494
+ if (block_height_ == 8) {
495
+ matmul_.MatVec8x4(
496
+ thread_bounds_.OffsetWeights(weights_.cast_data(), tid), rhs,
497
+ thread_bounds_.OffsetBias(bias, tid), nnz_per_row_.data(),
498
+ thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid),
499
+ thread_bounds_.StartRow(tid), thread_bounds_.StartRow(tid + 1), relu,
500
+ replicas, output_stride, thread_bounds_.OffsetOutput(output, tid));
501
+ } else {
502
+ CHECK_EQ(block_height_, 4) << "Block height must be 4 or 8!";
503
+ matmul_.MatVec4x4(
504
+ thread_bounds_.OffsetWeights(weights_.cast_data(), tid), rhs,
505
+ thread_bounds_.OffsetBias(bias, tid), nnz_per_row_.data(),
506
+ thread_bounds_.OffsetRhsIndices(rhs_indices_.data(), tid),
507
+ thread_bounds_.StartRow(tid), thread_bounds_.StartRow(tid + 1), relu,
508
+ replicas, output_stride, thread_bounds_.OffsetOutput(output, tid));
509
+ }
510
+ }
511
+
512
+ int rows() const { return rows_; }
513
+ int cols() const { return cols_; }
514
+ int block_height() const { return block_height_; }
515
+ int block_width() const { return block_width_; }
516
+ float sparsity() const { return sparsity_; }
517
+ int num_threads() const { return num_threads_; }
518
+ const ThreadBounds& thread_bounds() const { return thread_bounds_; }
519
+ const CacheAlignedVector<DeltaType>& rhs_indices() const {
520
+ return rhs_indices_;
521
+ }
522
+ const std::string& name() const { return name_; }
523
+ void set_name(const std::string& name) { name_ = name; }
524
+ const std::vector<int>& split_points() const {
525
+ return thread_bounds_.row_starts();
526
+ }
527
+
528
+ std::size_t bytes() const {
529
+ return weights_.size() * sizeof(WeightType) +
530
+ col_deltas_.size() * sizeof(DeltaType) +
531
+ nnz_per_row_.size() * sizeof(int);
532
+ }
533
+
534
+ // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
535
+ // and then samples from the output (softmax distribution) layer.
536
+ template <typename RhsClass, typename BiasClass, typename OutClass,
537
+ typename BiasType = typename BiasClass::value_type,
538
+ typename OutType = typename OutClass::value_type>
539
+ typename std::enable_if<!IsFixed32Type<OutType>::value, int>::type
540
+ SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
541
+ float temperature, int tid, SpinBarrier* barrier,
542
+ std::minstd_rand* gen,
543
+ CacheAlignedVector<float>* scratch) const {
544
+ SpMM_bias(rhs, bias, out, /*relu=*/false, tid, barrier);
545
+ return out->Sample(temperature, gen, scratch);
546
+ }
547
+ // Fixed32 version.
548
+ template <typename RhsClass, typename BiasClass, typename OutClass,
549
+ typename BiasType = typename BiasClass::value_type,
550
+ typename OutType = typename OutClass::value_type>
551
+ typename std::enable_if<IsFixed32Type<OutType>::value, int>::type
552
+ SpMM_bias_Sample(const RhsClass& rhs, const BiasClass& bias, OutClass* out,
553
+ float temperature, int tid, SpinBarrier* barrier,
554
+ std::minstd_rand* gen,
555
+ CacheAlignedVector<float>* scratch) const {
556
+ // We don't pass the barrier on, as we have more work to do.
557
+ SpMM_bias(rhs, bias, out, /*relu=*/false, tid);
558
+ return out->ReducingSample(gen, scratch, tid, temperature, barrier);
559
+ }
560
+
561
+ void Print() const {
562
+ std::cout << "Weights\n";
563
+ weights_.Print();
564
+ std::cout << std::endl;
565
+ std::cout << "Deltas\n";
566
+ col_deltas_.Print();
567
+ std::cout << std::endl;
568
+ std::cout << "nnz\n";
569
+ nnz_per_row_.Print();
570
+ std::cout << std::endl;
571
+ }
572
+
573
+ // Splits the computation amongst threads by rows, based on the number of
574
+ // non zeros, with the addition of a constant to account for the work of the
575
+ // bias and the horizontal add at the end, and also guarantees that each
576
+ // thread writes only whole cache lines, based on the size of OutType.
577
+ // The |cache_line_size| arg is used only for testing. Normally it is provided
578
+ // through the architecture #defines.
579
+ // Each thread gets a contiguous row range (|split_points|).
580
+ // Thread t does rows [ split_points[t], split_points[t + 1] )
581
+ // Each thread also needs to know how many non zeros were before it to skip
582
+ // (|nnz_to_skip|). And finally it also needs to know what the offset into
583
+ // the rhs vector would have been at the split point (|rhs_to_skip|).
584
+ //
585
+ // Some tricky corner cases where the number of non-zeros doesn't split
586
+ // nicely amongst the number of requested threads are not handled and default
587
+ // to one thread; these cases are only going to happen in tests and not in
588
+ // the matrices that correspond in real models.
589
+ //
590
+ // Returns the maximum number of threads that can be used; <= |num_threads|.
591
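+ //
+ // Threaded-use sketch, mirroring the multi-threaded unit tests (|matrix|,
+ // |rhs|, |bias|, |out| and |desired_threads| are placeholders):
+ //
+ //   int used_threads = matrix.PrepareForThreads(desired_threads);
+ //   // On each worker thread, with tid in [0, used_threads):
+ //   matrix.SpMM_bias(rhs, bias, &out, /*relu=*/false, tid);
+ //   // Threads must synchronize (e.g. via the SpinBarrier argument) before
+ //   // any of them reads |out| as the input to a later multiply.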
+ template <typename OutType = int32_t>
592
+ int PrepareForThreads(int num_threads, int cache_line_size = -1) {
593
+ CHECK_GT(num_threads, 0);
594
+ // We've already prepared for this number of threads; nothing to do.
595
+ if (num_threads == num_threads_) return num_threads_;
596
+
597
+ num_threads_ = num_threads;
598
+ thread_bounds_.PrepareForThreads(
599
+ block_width_, block_height_, num_threads_,
600
+ ReducedRowsPerCacheLine<OutType>(cache_line_size), reduced_rows_,
601
+ nnz_per_row_.data());
602
+ return num_threads_;
603
+ }
604
+
605
+ // Computes and stores the |rhs_indices_| from the |col_deltas_|.
606
+ void ComputeRHSIndices() {
607
+ std::vector<int> cumulative_deltas = CumulativeColDeltas();
608
+ std::vector<DeltaType> rhs_indices(cumulative_deltas.size() +
609
+ reduced_rows_);
610
+ int total_indices = 0;
611
+ int delta_index = 0;
612
+ for (int r = 0; r < reduced_rows_; ++r) {
613
+ for (int n = 0; n < nnz_per_row_[r]; ++n, ++delta_index) {
614
+ rhs_indices[total_indices++] =
615
+ cumulative_deltas[delta_index] / block_width_;
616
+ }
617
+ }
618
+ rhs_indices_ = CacheAlignedVector<DeltaType>(rhs_indices);
619
+ }
620
+
621
+ // Computes and stores the |col_deltas_| from the |rhs_indices_|.
622
+ void ComputeColDeltas() {
623
+ std::vector<int> col_deltas(rhs_indices_.size());
624
+ int prev_index = 0;
625
+ for (int i = 0; i < rhs_indices_.size(); ++i) {
626
+ int offset = rhs_indices_[i] - prev_index;
627
+ prev_index = rhs_indices_[i];
628
+ col_deltas[i] = offset * block_width_ * sizeof(RhsType);
629
+ }
630
+ col_deltas_ = CacheAlignedVector<DeltaType>(col_deltas);
631
+ }
632
+
633
+ // Computes and returns the inclusive prefix sum of the deltas, ie absolute
634
+ // positions.
635
+ std::vector<int> CumulativeColDeltas() const {
636
+ std::vector<int> cum_col_deltas(col_deltas_.size());
637
+ for (int i = 0; i < col_deltas_.size(); ++i) {
638
+ cum_col_deltas[i] = col_deltas_[i] / sizeof(RhsType);
639
+ if (i > 0) cum_col_deltas[i] += cum_col_deltas[i - 1];
640
+ }
641
+ return cum_col_deltas;
642
+ }
643
+
644
+ private:
645
+ constexpr std::size_t FixedParameterSize() const {
646
+ return sizeof(int) // rows
647
+ + sizeof(int) // cols
648
+ + sizeof(int) // reduced_rows
649
+ + sizeof(int) // reduced_cols
650
+ + sizeof(int) // block_width
651
+ + sizeof(int) // block_height
652
+ + sizeof(float) // sparsity
653
+ + sizeof(int) // col_multiple
654
+ + sizeof(int) // num_threads_
655
+ + sizeof(int) // weights_.size()
656
+ + sizeof(int) // col_deltas_.size()
657
+ + sizeof(int); // nnz_per_row_.size()
658
+ }
659
+ // Possible block sizes are only those supported by the computation; the
660
+ // default is 1x1, and the other options are 4x4 and 16x1.
661
+ template <typename InputType>
662
+ void DetermineBlockSize(const MaskedSparseMatrix<InputType>& masked_matrix) {
663
+ const std::vector<std::pair<int, int>> kPreferredOrder = {{4, 4}};
664
+ int rows = masked_matrix.rows();
665
+ int cols = masked_matrix.cols();
666
+
667
+ for (const auto& block_size : kPreferredOrder) {
668
+ int block_height, block_width;
669
+ std::tie(block_height, block_width) = block_size;
670
+ if (cols % block_width != 0) continue;
671
+
672
+ int reduced_rows = (rows + block_height - 1) / block_height;
673
+ int reduced_cols = cols / block_width;
674
+
675
+ // For each possible block, confirm that it is either all 0s or all 1s.
676
+ bool all_same = true;
677
+ const auto& mask = masked_matrix.mask();
678
+ for (int r = 0; r < reduced_rows; ++r) {
679
+ for (int c = 0; c < reduced_cols; ++c) {
680
+ int val = mask[r * block_height * cols + c * block_width];
681
+ for (int i = 0; i < block_height; ++i) {
682
+ for (int j = 0; j < block_width; ++j) {
683
+ int index = (r * block_height + i) * cols + c * block_width + j;
684
+ if (index < masked_matrix.mask().size()) {
685
+ all_same &= (masked_matrix.mask()[index] == val);
686
+ }
687
+ }
688
+ }
689
+ }
690
+ }
691
+
692
+ // If this block configuration is possible, accept it.
693
+ if (all_same) {
694
+ block_height_ = block_height;
695
+ block_width_ = block_width;
696
+ return;
697
+ }
698
+ }
699
+
700
+ // No large blocks were found, default to 1x1.
701
+ block_height_ = 1;
702
+ block_width_ = 1;
703
+ }
704
+
705
+ // CSR descriptors are for the reduced matrix, weights is the full matrix.
706
+ template <typename InputType>
707
+ void MakeColumnsMultiple(const std::vector<int>& row_offsets,
708
+ std::vector<int>* reduced_mask,
709
+ std::vector<InputType>* weights) {
710
+ if (col_multiple_ > 0) {
711
+ // Make sure each row has a number of columns that is a multiple of
712
+ // |col_multiple|.
713
+ for (int r = 1; r < row_offsets.size(); ++r) {
714
+ int num_row = row_offsets[r] - row_offsets[r - 1];
715
+ int num_needed = col_multiple_ - num_row % col_multiple_;
716
+ if (num_needed < col_multiple_) {
717
+ // Find gaps in the columns where we can insert a column of 0 weights.
718
+ int num_added = 0;
719
+ for (int c = 0; c < reduced_cols_; ++c) {
720
+ if ((*reduced_mask)[(r - 1) * reduced_cols_ + c] == 0) {
721
+ (*reduced_mask)[(r - 1) * reduced_cols_ + c] = 1;
722
+
723
+ // Zero out the weights that correspond to this block.
724
+ for (int i = 0; i < block_height_; ++i) {
725
+ for (int j = 0; j < block_width_; ++j) {
726
+ (*weights)[((r - 1) * block_height_ + i) * cols_ +
727
+ block_width_ * c + j] = InputType(0.f);
728
+ }
729
+ }
730
+ num_added++;
731
+ }
732
+
733
+ if (num_added == num_needed) break;
734
+ }
735
+ }
736
+ }
737
+ }
738
+ }
739
+
740
+ // Given the final dense mask and weights, convert to the compressed
741
+ // block CSR representation.
742
+ template <typename InputType>
743
+ void MaskAndWeightsToCsr(const std::vector<int>& mask,
744
+ const std::vector<InputType>& weights,
745
+ std::vector<int>* nnz_per_row,
746
+ std::vector<int>* col_indices,
747
+ std::vector<WeightType>* weights_csr) {
748
+ std::vector<int> row_offsets = {0};
749
+ int nnz = 0;
750
+ // Standard CSR format.
751
+ if (block_width_ == 1 && block_height_ == 1) {
752
+ for (int r = 0; r < rows_; ++r) {
753
+ for (int c = 0; c < cols_; ++c) {
754
+ if (mask[r * cols_ + c] == 1) {
755
+ nnz++;
756
+ col_indices->push_back(c);
757
+ weights_csr->push_back(WeightType(weights[r * cols_ + c]));
758
+ }
759
+ }
760
+ row_offsets.push_back(nnz);
761
+ }
762
+ } else if (block_width_ == 4 && block_height_ == 4) {
763
+ // Weights are stored contiguously for each block in this case.
764
+ for (int r = 0; r < reduced_rows_; ++r) {
765
+ for (int c = 0; c < reduced_cols_; ++c) {
766
+ if (mask[r * reduced_cols_ + c] == 1) {
767
+ col_indices->push_back(c);
768
+ nnz++;
769
+ for (int i = 0; i < block_height_; ++i) {
770
+ for (int j = 0; j < block_width_; ++j) {
771
+ int row_index = (block_height_ * r + i) * cols_;
772
+ int w_index = row_index + block_width_ * c + j;
773
+ WeightType weight = w_index < weights.size()
774
+ ? WeightType(weights[w_index])
775
+ : WeightType(0.0f);
776
+ weights_csr->push_back(weight);
777
+ }
778
+ }
779
+ }
780
+ }
781
+ row_offsets.push_back(nnz);
782
+ }
783
+ }
784
+ for (int i = 1; i < row_offsets.size(); ++i)
785
+ nnz_per_row->push_back(row_offsets[i] - row_offsets[i - 1]);
786
+ }
787
+
788
+ // Returns the number of block rows per cache line. This is the minimum unit
789
+ // into which the calculation is broken for threads.
790
+ template <typename OutType>
791
+ int ReducedRowsPerCacheLine(int override_cache_line_size = -1) const {
792
+ int line_size = kCacheLineSize;
793
+ if (override_cache_line_size >= 1) line_size = override_cache_line_size;
794
+ return std::max<int>(line_size / (block_height_ * sizeof(OutType)), 1);
795
+ }
796
+
797
+ int col_multiple_;
798
+ int rows_;
799
+ int cols_;
800
+ int reduced_rows_;
801
+ int reduced_cols_;
802
+ float sparsity_;
803
+ int block_width_;
804
+ int block_height_;
805
+ int num_threads_;
806
+ std::string name_;
807
+
808
+ CacheAlignedVector<WeightType> weights_;
809
+ CacheAlignedVector<DeltaType> col_deltas_;
810
+ CacheAlignedVector<int> nnz_per_row_;
811
+ // |thread_bounds_| and |rhs_indices_| don't need to be serialized as they are
812
+ // always recalculated from serialized data.
813
+ CacheAlignedVector<DeltaType> rhs_indices_;
814
+ Matmul<WeightType, RhsType> matmul_;
815
+ ThreadBounds thread_bounds_;
816
+ static constexpr int kCacheLineSize = 64;
817
+ };
818
+
819
+ // Converts a sparse matrix represented with (|mask|, |weights|, |rows|, |cols|) into
820
+ // the CSR format, and returns that as a serialized string.
821
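+ //
+ // Usage sketch (|mask| and |weights| are placeholders for row-major vectors
+ // of size rows * cols):
+ //
+ //   std::string serialized = ConvertDenseToSparseRepresentation_Int16Deltas(
+ //       mask, weights, rows, cols);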
+ template <typename MaskType>
822
+ std::string ConvertDenseToSparseRepresentation_Int16Deltas(
823
+ const std::vector<MaskType>& mask, const std::vector<float>& weights,
824
+ const int rows, const int cols) {
825
+ MaskedSparseMatrix<float> masked_weights(rows, cols, mask.data(),
826
+ weights.data());
827
+ CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
828
+ sparse_masked_weights(masked_weights);
829
+ std::string buffer;
830
+ sparse_masked_weights.WriteToFlatBuffer(&buffer);
831
+ return buffer;
832
+ }
833
+
834
+ } // namespace csrblocksparse
835
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_CSR_BLOCKSPARSE_MATRIX_H_
sparse_matmul/layers/csrblocksparse_test.cc ADDED
@@ -0,0 +1,977 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include <array>
16
+ #include <cstdint>
17
+ #include <tuple>
18
+ #include <vector>
19
+
20
+ // Placeholder for get runfiles header.
21
+ #include "absl/status/status.h"
22
+ #include "absl/strings/str_cat.h"
23
+ #include "absl/strings/string_view.h"
24
+ #include "absl/types/span.h"
25
+ #include "gtest/gtest.h"
26
+ #include "include/ghc/filesystem.hpp"
27
+ #include "sparse_matmul/compute/matmul.h"
28
+ #include "sparse_matmul/layers/utils.h"
29
+ #include "sparse_matmul/numerics/test_utils.h"
30
+ #include "sparse_matmul/os/coop_threads.h"
31
+
32
+ namespace csrblocksparse {
33
+ namespace {
34
+
35
+ inline constexpr absl::string_view kTestdataPath = "layers/testdata";
36
+
37
+ TEST(CSRBlockSparseMatrix, FlatBufferSerialization) {
38
+ const int kRows = 8;
39
+ const int kCols = 8;
40
+ std::vector<int> mask = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
41
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
42
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
43
+ 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
44
+ std::vector<float> values(kRows * kCols, 1.f);
45
+ values[1] = 2.f;
46
+ values[3] = 3.f;
47
+ values[36] = -1.f;
48
+ values[45] = -2.f;
49
+
50
+ csrblocksparse::CacheAlignedVector<float> bias(kRows);
51
+ csrblocksparse::CacheAlignedVector<float> rhs(kCols);
52
+ csrblocksparse::CacheAlignedVector<float> out_ref(kRows);
53
+ csrblocksparse::CacheAlignedVector<float> out_test(kRows);
54
+
55
+ bias.FillZero();
56
+ rhs.FillOnes();
57
+
58
+ csrblocksparse::MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(),
59
+ values.data());
60
+
61
+ matrix.SpMM_bias(rhs, bias, &out_ref);
62
+
63
+ csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
64
+ block_sparse_matrix(matrix);
65
+
66
+ std::string buffer;
67
+ std::size_t num_bytes = block_sparse_matrix.WriteToFlatBuffer(&buffer);
68
+
69
+ csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
70
+ new_block_sparse_matrix(reinterpret_cast<const uint8_t*>(buffer.c_str()),
71
+ num_bytes);
72
+
73
+ new_block_sparse_matrix.SpMM_bias(rhs, bias, &out_test);
74
+
75
+ CheckResult(out_ref, out_test, kCols);
76
+ }
77
+
78
+ template <typename ComputeType, typename RhsType, typename OutType>
79
+ void CorrectnessCheckBlockSpMM(int rows, int cols, int block_height,
80
+ int block_width, float sparsity,
81
+ bool use_relu = false, int num_threads = 1,
82
+ int fatness = 1, bool test_matmul = false) {
83
+ using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
84
+ MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
85
+ block_width);
86
+ matrix.CastWeights<ComputeType>();
87
+ FatCacheAlignedVector<RhsType> rhs(cols, fatness);
88
+ CacheAlignedVector<BiasType> bias(rows);
89
+ FatCacheAlignedVector<OutType> out(rows, fatness);
90
+
91
+ bias.FillRandom();
92
+ rhs.FillRandom();
93
+ out.FillZero();
94
+ FatCacheAlignedVector<OutType> out_reference = out;
95
+
96
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
97
+
98
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
99
+
100
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
101
+ std::move(sparse_matrix), std::move(bias));
102
+ num_threads = sparse_linear_layer.PrepareForThreads(num_threads);
103
+
104
+ // Checks that the result of applying each thread's portion serially is
105
+ // correct.
106
+ for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
107
+ sparse_linear_layer.SpMM_bias(rhs, &out, use_relu, thread_id);
108
+ }
109
+
110
+ CheckResult(out_reference, out, sparse_linear_layer.cols());
111
+
112
+ if (test_matmul) {
113
+ for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
114
+ sparse_linear_layer.MatVec(rhs, use_relu, thread_id,
115
+ /*replicas=*/1, /*output_stride=*/0, &out);
116
+ }
117
+
118
+ CheckResult(out_reference, out, sparse_linear_layer.cols());
119
+ }
120
+ }
121
+
122
+ // Does:
123
+ // y = Ax + b;
124
+ // x = Ay + b;
125
+ // y = Ax + b;
126
+ //
127
+ // to make sure that dependent multiplies are correct.
128
+ template <typename ComputeType, typename RhsType, typename OutType>
129
+ void ThreadBody(
130
+ SpinBarrier* spin_barrier, int tid,
131
+ const SparseLinearLayer<ComputeType, RhsType>& sparse_linear_layer,
132
+ FatCacheAlignedVector<RhsType>* rhs, FatCacheAlignedVector<OutType>* out,
133
+ bool use_relu) {
134
+ sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
135
+ spin_barrier->barrier();
136
+ sparse_linear_layer.SpMM_bias(*out, rhs, use_relu, tid);
137
+ spin_barrier->barrier();
138
+ sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
139
+ }
140
+
141
+ template <typename ComputeType, typename RhsType, typename OutType>
142
+ void CorrectnessCheckBlockSpMM_MultiThread(int rows, int cols, int block_height,
143
+ int block_width, float sparsity,
144
+ bool use_relu = false,
145
+ int num_threads = 1,
146
+ int fatness = 1) {
147
+ typedef typename TypeOfProduct<ComputeType, RhsType>::type BiasType;
148
+ CHECK(rows == cols);
149
+ MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
150
+ block_width);
151
+ matrix.CastWeights<ComputeType>();
152
+ FatCacheAlignedVector<RhsType> rhs(cols, fatness);
153
+ FatCacheAlignedVector<RhsType> rhs_mt(cols, fatness);
154
+ CacheAlignedVector<BiasType> bias(rows);
155
+ FatCacheAlignedVector<OutType> out(rows, fatness);
156
+
157
+ bias.FillOnes();
158
+ rhs.FillOnes();
159
+ rhs_mt.FillOnes();
160
+ out.FillZero();
161
+ FatCacheAlignedVector<OutType> out_reference = out;
162
+
163
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
164
+ matrix.SpMM_bias(out_reference, bias, &rhs, use_relu);
165
+ matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
166
+
167
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
168
+
169
+ num_threads = sparse_matrix.PrepareForThreads(num_threads,
170
+ /*cache_line_size=*/1);
171
+
172
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
173
+ std::move(sparse_matrix), std::move(bias));
174
+
175
+ csrblocksparse::LaunchOnThreadsWithBarrier(
176
+ num_threads, ThreadBody<ComputeType, RhsType, OutType>,
177
+ sparse_linear_layer, &rhs_mt, &out, use_relu);
178
+
179
+ CheckResult(out_reference, out, cols);
180
+ }
181
+
182
+ } // namespace
183
+
184
+ TEST(MaskedSparseCorrectness, HandCoded) {
185
+ const int kRows = 8;
186
+ const int kCols = 8;
187
+ // clang-format off
188
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
189
+ 0, 1, 0, 1, 0, 1, 0, 1,
190
+ 1, 0, 0, 1, 1, 1, 1, 0,
191
+ 0, 0, 0, 0, 0, 0, 0, 0,
192
+ 1, 1, 1, 1, 1, 1, 1, 1,
193
+ 0, 0, 0, 0, 1, 1, 0, 0,
194
+ 1, 1, 0, 0, 1, 1, 0, 0,
195
+ 1, 0, 0, 0, 0, 1, 0, 1};
196
+ // clang-format on
197
+ std::vector<float> values(kRows * kCols, 1.f);
198
+
199
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
200
+
201
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
202
+ CacheAlignedVector<float> rhs(kCols);
203
+ CacheAlignedVector<float> bias(kRows);
204
+ CacheAlignedVector<float> out(kRows);
205
+
206
+ bias.FillOnes();
207
+ rhs.FillOnes();
208
+ out.FillZero();
209
+
210
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
211
+ std::move(bias));
212
+
213
+ masked_linear_layer.SpMM_bias(rhs, &out);
214
+
215
+ for (int i = 0; i < kRows; ++i) {
216
+ EXPECT_EQ(answer[i], out[i]);
217
+ }
218
+ }
219
+
220
+ TEST(MaskedSparseCorrectness, HandCodedFatVector) {
221
+ const int kRows = 8;
222
+ const int kCols = 8;
223
+ // clang-format off
224
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
225
+ 0, 1, 0, 1, 0, 1, 0, 1,
226
+ 1, 0, 0, 1, 1, 1, 1, 0,
227
+ 0, 0, 0, 0, 0, 0, 0, 0,
228
+ 1, 1, 1, 1, 1, 1, 1, 1,
229
+ 0, 0, 0, 0, 1, 1, 0, 0,
230
+ 1, 1, 0, 0, 1, 1, 0, 0,
231
+ 1, 0, 0, 0, 0, 1, 0, 1};
232
+ // clang-format on
233
+
234
+ std::vector<float> values(kRows * kCols, 1.f);
235
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
236
+
237
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
238
+ const int kMaxWidth = 5;
239
+ for (int width = 5; width <= kMaxWidth; ++width) {
240
+ FatCacheAlignedVector<float> rhs(kCols, width);
241
+ CacheAlignedVector<float> bias(kRows);
242
+ FatCacheAlignedVector<float> out(kRows, width);
243
+
244
+ bias.FillOnes();
245
+ rhs.FillOnes();
246
+ out.FillZero();
247
+
248
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
249
+ std::move(bias));
250
+
251
+ masked_linear_layer.SpMM_bias(rhs, &out);
252
+
253
+ for (int i = 0; i < kRows; ++i) {
254
+ for (int width = 0; width < kMaxWidth; ++width) {
255
+ EXPECT_EQ(answer[i], out[i + width * kRows]);
256
+ }
257
+ }
258
+ }
259
+ }
260
+
261
+ TEST(CsrBlockSparseMatrix, HandCodedMultiThread) {
262
+ const int kRows = 8;
263
+ const int kCols = 8;
264
+ // clang-format off
265
+ std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
266
+ 0, 1, 0, 1, 0, 1, 0, 1,
267
+ 1, 0, 0, 1, 1, 1, 1, 0,
268
+ 0, 0, 0, 0, 0, 0, 0, 0,
269
+ 1, 1, 1, 1, 1, 1, 1, 1,
270
+ 0, 0, 0, 0, 1, 1, 0, 0,
271
+ 1, 1, 0, 0, 1, 1, 0, 0,
272
+ 1, 0, 0, 0, 0, 1, 0, 1};
273
+ // clang-format on
274
+ std::vector<float> values(kRows * kCols, 1.f);
275
+
276
+ std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
277
+
278
+ MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
279
+ CacheAlignedVector<float> rhs(kCols);
280
+ CacheAlignedVector<float> bias(kRows);
281
+ CacheAlignedVector<float> out(kRows);
282
+
283
+ bias.FillOnes();
284
+ rhs.FillOnes();
285
+ out.FillZero();
286
+
287
+ CacheAlignedVector<float> bias_csr = bias;
288
+
289
+ CsrBlockSparseMatrix<bfloat16, float> sparse_matrix(matrix);
290
+
291
+ MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
292
+ std::move(bias));
293
+
294
+ masked_linear_layer.SpMM_bias(rhs, &out);
295
+
296
+ SparseLinearLayer<bfloat16, float> sparse_linear_layer(
297
+ std::move(sparse_matrix), std::move(bias_csr));
298
+ sparse_linear_layer.PrepareForThreads(2, /*cache_line_size=*/1);
299
+
300
+ CacheAlignedVector<float> out_tmp(kRows);
301
+ const bool kUseRelu = false;
302
+ sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/0);
303
+ sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/1);
304
+
305
+ for (int i = 0; i < kRows; ++i) {
306
+ EXPECT_EQ(answer[i], out_tmp[i]);
307
+ }
308
+ }
309
+
310
+ TEST(TestCasts, TestBfloat16) {
311
+ const int kRows = 1000;
312
+ const int kCols = 100;
313
+ const float kSparsity = 0.f;
314
+
315
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
316
+ MaskedSparseMatrix<float> matrix_bfloat16(kRows, kCols, matrix.mask().data(),
317
+ matrix.values().data());
318
+
319
+ matrix_bfloat16.CastWeights<bfloat16>();
320
+
321
+ CheckResult(matrix.values(), matrix_bfloat16.values(), kCols);
322
+ }
323
+
324
+ TEST(TestCasts, TestFP16) {
325
+ const int kRows = 1000;
326
+ const int kCols = 100;
327
+ const float kSparsity = 0.f;
328
+
329
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
330
+ #if !defined __arm__ && !defined __aarch64__
331
+ // Conversion doesn't handle denormals, so flush denormals to zero first.
332
+ for (int i = 0; i < matrix.values().size(); ++i) {
333
+ if (matrix.data()[i] < 1. / static_cast<float>(1 << 14))
334
+ matrix.data()[i] = 0.f;
335
+ }
336
+ #endif
337
+ MaskedSparseMatrix<float> matrix_fp16(kRows, kCols, matrix.mask().data(),
338
+ matrix.values().data());
339
+
340
+ matrix_fp16.CastWeights<csrblocksparse::fp16>();
341
+
342
+ CheckResult(matrix.values(), matrix_fp16.values(), kCols);
343
+ }
344
+
345
+ TEST(TestCasts, TestFixed16) {
346
+ const int kRows = 100000;
347
+ const int kCols = 1;
348
+ const float kSparsity = 0.f;
349
+
350
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
351
+
352
+ // Relative error for fixed point is high near 0.
353
+ for (int i = 0; i < matrix.values().size(); ++i) {
354
+ // 1.1e-3 is based on the max error of .013 and a grid spacing of 1 / 2**16
355
+ // == 3e-5. 3e-5 / .013 / 2 = 1.1e-3.
356
+ if (std::abs(matrix.data()[i]) < 1.1e-3) {
357
+ matrix.data()[i] = 0.f;
358
+ }
359
+ }
360
+
361
+ MaskedSparseMatrix<float> matrix_fixed16 = matrix;
362
+
363
+ matrix_fixed16.CastWeights<csrblocksparse::fixed16</*ExponentBits=*/0>>();
364
+
365
+ CheckResult(matrix.values(), matrix_fixed16.values(), kCols);
366
+ }
367
+
368
+ TEST(TestCasts, TestFixed32) {
369
+ const int kRows = 100000;
370
+ const int kCols = 1;
371
+ const float kSparsity = 0.f;
372
+
373
+ MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
374
+ MaskedSparseMatrix<float> matrix_fixed32 = matrix;
375
+
376
+ matrix_fixed32.CastWeights<csrblocksparse::fixed32</*ExponentBits=*/0>>();
377
+
378
+ CheckResult(matrix.values(), matrix_fixed32.values(), kCols);
379
+ }
380
+
381
+ template <typename ComputeType, typename RhsType, typename OutType>
382
+ void TestSpMM(int block_width, int block_height, int fatness,
383
+ bool test_matmul = false) {
384
+ std::array<bool, 2> use_relu = {false, true};
385
+ std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
386
+ std::vector<std::pair<int, int>> sizes = {{8, 8}, {128, 128}, {128, 64},
387
+ {256, 192}, {512, 512}, {1024, 512},
388
+ {384, 384}, {512, 384}};
389
+ for (int num_threads = 1; num_threads < 2 + test_matmul; ++num_threads) {
390
+ for (const auto& relu : use_relu) {
391
+ for (const auto& sparsity : sparsity_levels) {
392
+ for (const auto& size : sizes) {
393
+ int rows, cols;
394
+ std::tie(rows, cols) = size;
395
+ CorrectnessCheckBlockSpMM<ComputeType, RhsType, OutType>(
396
+ rows, cols, block_height, block_width, sparsity, relu,
397
+ num_threads, fatness, test_matmul);
398
+ }
399
+ }
400
+ }
401
+ }
402
+ }
403
+
404
+ template <typename ComputeType, typename RhsType, typename OutType>
405
+ void TestSpMM_MultiThread(int block_width, int block_height, int fatness) {
406
+ std::array<bool, 2> use_relu = {false, true};
407
+ std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
408
+ std::vector<std::pair<int, int>> sizes = {
409
+ {48, 48}, {128, 128}, {512, 512}, {384, 384}};
410
+ for (int num_threads = 1; num_threads < 5; ++num_threads) {
411
+ for (const auto& relu : use_relu) {
412
+ for (const auto& sparsity : sparsity_levels) {
413
+ for (const auto& size : sizes) {
414
+ int rows, cols;
415
+ std::tie(rows, cols) = size;
416
+ CorrectnessCheckBlockSpMM_MultiThread<ComputeType, RhsType, OutType>(
417
+ rows, cols, block_height, block_width, sparsity, relu,
418
+ num_threads, fatness);
419
+ }
420
+ }
421
+ }
422
+ }
423
+ }
424
+
425
+ template <typename DataType>
426
+ void TestSumVectors(int start = 0, int end = -1, int size = 6) {
427
+ std::vector<DataType> values;
428
+ std::vector<DataType> answer;
429
+
430
+ for (int i = 1; i < size + 1; ++i) {
431
+ const float x = static_cast<float>(i);
432
+ values.push_back(static_cast<DataType>(x));
433
+ answer.push_back(static_cast<DataType>(x * 2));
434
+ }
435
+
436
+ if (end == -1) {
437
+ end = values.size();
438
+ }
439
+
440
+ csrblocksparse::CacheAlignedVector<DataType> result(values.size());
441
+ csrblocksparse::CacheAlignedVector<DataType> values_aligned(values);
442
+ detail::SumVectors(start, end, values_aligned.data(), values_aligned.data(),
443
+ result.data());
444
+ for (int i = start; i < end; ++i) {
445
+ EXPECT_EQ(static_cast<float>(answer[i]), static_cast<float>(result[i]));
446
+ }
447
+ }
448
+
449
+ TEST(CsrBlockSparseMatrix, SumVectors_Generic) {
450
+ TestSumVectors<float>();
451
+ TestSumVectors<float>(1);
452
+ TestSumVectors<float>(1, 4);
453
+ }
454
+
455
+ TEST(CsrBlockSparseMatrix, SumVectors_Bfloat16) {
456
+ TestSumVectors<csrblocksparse::bfloat16>();
457
+ TestSumVectors<csrblocksparse::bfloat16>(1);
458
+ TestSumVectors<csrblocksparse::bfloat16>(1, 4);
459
+ }
460
+
461
+ // For SIMD-optimized SumVectors, the memory of the vector should be at least
462
+ // |kSIMDWidth * sizeof(float)| long, and the start position has to be an
463
+ // aligned memory location. So we set |size| to 100 to be safe and |start|
464
+ // to 0 (|start| == 1 is not aligned).
465
+ TEST(CsrBlockSparseMatrix, SumVectors_Fixed16) {
466
+ TestSumVectors<csrblocksparse::fixed16<8>>(0, -1, 100);
467
+ TestSumVectors<csrblocksparse::fixed16<8>>(0, 4, 100);
468
+ }
469
+
470
+ TEST(CsrBlockSparseMatrix, SumVectors_Fixed32) {
471
+ TestSumVectors<csrblocksparse::fixed32<11>>(0, -1, 100);
472
+ TestSumVectors<csrblocksparse::fixed32<11>>(0, 4, 100);
473
+ }
474
+
475
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_Bfloat16) {
476
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/4,
477
+ /*block_height=*/4,
478
+ /*fatness=*/7);
479
+ }
480
+
481
+ // This actually uses multiple threads, and uses the output as the input for
482
+ // multiple steps to test that synchronization and memory visibility is
483
+ // working correctly. Requires square matrices.
484
+ TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_Bfloat16) {
485
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
486
+ /*block_width=*/4,
487
+ /*block_height=*/4,
488
+ /*fatness=*/1);
489
+ }
490
+
491
+ TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_Bfloat16) {
492
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
493
+ /*block_width=*/4,
494
+ /*block_height=*/4,
495
+ /*fatness=*/7);
496
+ }
497
+
498
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_Bfloat16) {
499
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
500
+ /*block_height=*/1,
501
+ /*fatness=*/1);
502
+ }
503
+
504
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_Bfloat16) {
505
+ TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
506
+ /*block_height=*/1,
507
+ /*fatness=*/7);
508
+ }
509
+
510
+ // This actually uses multiple threads, and uses the output as the input for
511
+ // multiple steps to test that synchronization and memory visibility is
512
+ // working correctly. Requires square matrices.
513
+ TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_Bfloat16) {
514
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
515
+ /*block_width=*/1,
516
+ /*block_height=*/1,
517
+ /*fatness=*/1);
518
+ }
519
+
520
+ TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_Bfloat16) {
521
+ TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
522
+ /*block_width=*/1,
523
+ /*block_height=*/1,
524
+ /*fatness=*/7);
525
+ }
526
+
527
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_float) {
528
+ TestSpMM<float, float, float>(/*block_width=*/4,
529
+ /*block_height=*/4,
530
+ /*fatness=*/1,
531
+ /*test_matmul=*/true);
532
+ }
533
+
534
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_float) {
535
+ TestSpMM<float, float, float>(/*block_width=*/4,
536
+ /*block_height=*/4,
537
+ /*fatness=*/7);
538
+ }
539
+
540
+ // This actually uses multiple threads, and uses the output as the input for
541
+ // multiple steps to test that synchronization and memory visibility is
542
+ // working correctly. Requires square matrices.
543
+ TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_float) {
544
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
545
+ /*block_height=*/4,
546
+ /*fatness=*/1);
547
+ }
548
+
549
+ TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_float) {
550
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
551
+ /*block_height=*/4,
552
+ /*fatness=*/7);
553
+ }
554
+
555
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_float) {
556
+ TestSpMM<float, float, float>(/*block_width=*/1,
557
+ /*block_height=*/1,
558
+ /*fatness=*/1);
559
+ }
560
+
561
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_float) {
562
+ TestSpMM<float, float, float>(/*block_width=*/1,
563
+ /*block_height=*/1,
564
+ /*fatness=*/7);
565
+ }
566
+
567
+ // This actually uses multiple threads, and uses the output as the input for
568
+ // multiple steps to test that synchronization and memory visibility is
569
+ // working correctly. Requires square matrices.
570
+ TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_float) {
571
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
572
+ /*block_height=*/1,
573
+ /*fatness=*/1);
574
+ }
575
+
576
+ TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_float) {
577
+ TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
578
+ /*block_height=*/1,
579
+ /*fatness=*/7);
580
+ }
581
+
582
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32) {
583
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
584
+ typename csrblocksparse::TypeOfProduct<
585
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
586
+ /*block_width=*/4,
587
+ /*block_height=*/4,
588
+ /*fatness=*/1,
589
+ /*test_matmul=*/true);
590
+ }
591
+
592
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32) {
593
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
594
+ typename csrblocksparse::TypeOfProduct<
595
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
596
+ /*block_width=*/4,
597
+ /*block_height=*/4,
598
+ /*fatness=*/7);
599
+ }
600
+
601
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32) {
602
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
603
+ typename csrblocksparse::TypeOfProduct<
604
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
605
+ /*block_width=*/1,
606
+ /*block_height=*/1,
607
+ /*fatness=*/1);
608
+ }
609
+
610
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32) {
611
+ TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
612
+ typename csrblocksparse::TypeOfProduct<
613
+ csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
614
+ /*block_width=*/1,
615
+ /*block_height=*/1,
616
+ /*fatness=*/7);
617
+ }
618
+
619
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_16) {
620
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
621
+ csrblocksparse::fixed16<8>>(
622
+ /*block_width=*/4,
623
+ /*block_height=*/4,
624
+ /*fatness=*/1,
625
+ /*test_matmul=*/true);
626
+ }
627
+
628
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_16) {
629
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
630
+ csrblocksparse::fixed16<8>>(
631
+ /*block_width=*/4,
632
+ /*block_height=*/4,
633
+ /*fatness=*/7);
634
+ }
635
+
636
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_16) {
637
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
638
+ csrblocksparse::fixed16<8>>(
639
+ /*block_width=*/1,
640
+ /*block_height=*/1,
641
+ /*fatness=*/1);
642
+ }
643
+
644
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_16) {
645
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
646
+ csrblocksparse::fixed16<8>>(
647
+ /*block_width=*/1,
648
+ /*block_height=*/1,
649
+ /*fatness=*/7);
650
+ }
651
+
652
+ TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32_unmatched) {
653
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
654
+ csrblocksparse::fixed32<13>>(
655
+ /*block_width=*/4,
656
+ /*block_height=*/4,
657
+ /*fatness=*/1,
658
+ /*test_matmul=*/true);
659
+ }
660
+
661
+ TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32_unmatched) {
662
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
663
+ csrblocksparse::fixed32<13>>(
664
+ /*block_width=*/4,
665
+ /*block_height=*/4,
666
+ /*fatness=*/7);
667
+ }
668
+
669
+ TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32_unmatched) {
670
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
671
+ csrblocksparse::fixed32<13>>(
672
+ /*block_width=*/1,
673
+ /*block_height=*/1,
674
+ /*fatness=*/1);
675
+ }
676
+
677
+ TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32_unmatched) {
678
+ TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
679
+ csrblocksparse::fixed32<13>>(
680
+ /*block_width=*/1,
681
+ /*block_height=*/1,
682
+ /*fatness=*/7);
683
+ }
684
+
685
+ TEST(CsrBlockSparseMatrix, RhsIndicesDeltasRoundTrip) {
686
+ MaskedSparseMatrix<float> matrix(/*rows=*/256, /*cols=*/256,
687
+ /*sparsity=*/0.9, /*block_height=*/4,
688
+ /*block_width=*/4);
689
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
690
+ CacheAlignedVector<int16_t> copy_indices = sparse_matrix.rhs_indices();
691
+ sparse_matrix.ComputeColDeltas();
692
+ sparse_matrix.ComputeRHSIndices();
693
+ // They get padded when created, so the newer one could be bigger.
694
+ EXPECT_LE(copy_indices.size(), sparse_matrix.rhs_indices().size());
695
+ for (int i = 0; i < copy_indices.size(); ++i) {
696
+ EXPECT_EQ(copy_indices[i], sparse_matrix.rhs_indices()[i]) << "i=" << i;
697
+ }
698
+ }
699
+
700
+ // Tests that a Layer that is split into 2 by columns (inputs) computes the same
701
+ // result as the original layer.
702
+ TEST(CsrBlockSparseMatrix, SplitByCol) {
703
+ int kRows = 1024;
704
+ int kCols = 1024;
705
+ MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
706
+ /*block_width=*/4);
707
+ FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
708
+ CacheAlignedVector<float> bias(kRows);
709
+ FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
710
+ FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
711
+
712
+ bias.FillRandom();
713
+ rhs.FillRandom();
714
+ out1.FillZero();
715
+ out2.FillZero();
716
+ FatCacheAlignedVector<float> out_reference = out1;
717
+
718
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
719
+
720
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
721
+ std::move(bias));
722
+ sparse_linear_layer.PrepareForThreads(1);
723
+ sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
724
+ /*tid=*/0);
725
+ // Split the layer into 2 parts.
726
+ SparseLinearLayer<float, float> part1, part2;
727
+ sparse_linear_layer.SplitInputs(&part1, &part2);
728
+ part1.PrepareForThreads(1);
729
+ part2.PrepareForThreads(1);
730
+ EXPECT_EQ(kRows, part1.rows());
731
+ EXPECT_EQ(kCols / 2, part1.cols());
732
+ EXPECT_EQ(kRows, part2.rows());
733
+ EXPECT_EQ(kCols / 2, part2.cols());
734
+ MutableVectorView<float> rhs1(&rhs, 0, kCols / 2);
735
+ MutableVectorView<float> rhs2(&rhs, kCols / 2, kCols / 2);
736
+ for (int i = 0; i < kCols / 2; ++i) {
737
+ EXPECT_FLOAT_EQ(rhs[i], rhs1.data()[i]);
738
+ EXPECT_FLOAT_EQ(rhs[i + kCols / 2], rhs2.data()[i]);
739
+ }
740
+ part1.SpMM_bias(rhs1, &out1, /*relu=*/false, /*tid=*/0);
741
+ part2.SpMM_bias(rhs2, &out2, /*relu=*/false, /*tid=*/0);
742
+ // Check that out1 + out2 = out_reference.
743
+ for (int i = 0; i < kRows; ++i) {
744
+ EXPECT_NEAR(out_reference[i], out1[i] + out2[i], 2e-5)
745
+ << " i=" << i << " out1=" << out1[i] << " out2=" << out2[i];
746
+ }
747
+ }
748
+ // Tests that a Layer that is split into 2 by rows (outputs) computes the same
749
+ // result as the original layer.
750
+ TEST(CsrBlockSparseMatrix, SplitByRow) {
751
+ int kRows = 1024;
752
+ int kCols = 1024;
753
+ MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
754
+ /*block_width=*/4);
755
+ FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
756
+ CacheAlignedVector<float> bias(kRows);
757
+ FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
758
+ FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
759
+
760
+ bias.FillRandom();
761
+ rhs.FillRandom();
762
+ out1.FillZero();
763
+ out2.FillZero();
764
+ FatCacheAlignedVector<float> out_reference = out1;
765
+
766
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
767
+
768
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
769
+ std::move(bias));
770
+ sparse_linear_layer.PrepareForThreads(1);
771
+ sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
772
+ /*tid=*/0);
773
+ // Split the layer into 2 parts.
774
+ SparseLinearLayer<float, float> part1, part2;
775
+ sparse_linear_layer.SplitOutputs(&part1, &part2);
776
+ part1.PrepareForThreads(1);
777
+ part2.PrepareForThreads(1);
778
+ EXPECT_EQ(kRows / 2, part1.rows());
779
+ EXPECT_EQ(kCols, part1.cols());
780
+ EXPECT_EQ(kRows / 2, part2.rows());
781
+ EXPECT_EQ(kCols, part2.cols());
782
+ MutableVectorView<float> out2a(&out2, 0, kRows / 2);
783
+ MutableVectorView<float> out2b(&out2, kRows / 2, kRows / 2);
784
+ part1.SpMM_bias(rhs, &out2a, /*relu=*/false, /*tid=*/0);
785
+ part2.SpMM_bias(rhs, &out2b, /*relu=*/false, /*tid=*/0);
786
+ // Check that out2 = out_reference.
787
+ for (int i = 0; i < kRows; ++i) {
788
+ EXPECT_NEAR(out_reference[i], out2[i], 2e-5)
789
+ << " i=" << i << " out1=" << out_reference[i] << " out2=" << out2[i];
790
+ }
791
+ }
792
+
793
+ TEST(CsrBlockSparseMatrix, MutableVectorView) {
794
+ const int kRows = 1024;
795
+ const int kCols = 1024;
796
+ const int kFatness = 2;
797
+
798
+ std::vector<float> values(kRows * kCols, 1.f);
799
+ std::vector<int> mask(kRows * kCols);
800
+ for (int i = 0; i < mask.size(); ++i) mask[i] = i % 2;
801
+
802
+ auto masked_matrix =
803
+ MaskedSparseMatrix<float>(kRows, kCols, mask.data(), values.data());
804
+ auto sparse_matrix = CsrBlockSparseMatrix<bfloat16, float>(masked_matrix);
805
+ FatCacheAlignedVector<float> x(kCols, kFatness);
806
+ x.FillOnes();
807
+
808
+ CacheAlignedVector<float> bias(kRows);
809
+ bias.FillZero();
810
+
811
+ // First check that we can use spans as output. Split a multiplication
812
+ // into upper and lower halves times the full vector:
813
+ // --------------- x t
814
+ // | | x t
815
+ // | | x t
816
+ // --------------- =
817
+ // | | x b
818
+ // | | x b
819
+ // --------------- x b
820
+
821
+ FatCacheAlignedVector<float> out(kRows, kFatness);
822
+ FatCacheAlignedVector<float> out_view(kRows, kFatness);
823
+
824
+ MutableVectorView<float> out_view_top(&out_view, 0, kRows / 2);
825
+ MutableVectorView<float> out_view_bottom(&out_view, kRows / 2, kRows / 2);
826
+
827
+ sparse_matrix.SpMM_bias(x, bias, &out);
828
+
829
+ auto masked_matrix_top =
830
+ MaskedSparseMatrix<float>(kRows / 2, kCols, mask.data(), values.data());
831
+ auto masked_matrix_bottom = MaskedSparseMatrix<float>(
832
+ kRows / 2, kCols, mask.data() + kRows * kCols / 2,
833
+ values.data() + kRows * kCols / 2);
834
+ auto sparse_matrix_top =
835
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_top);
836
+ auto sparse_matrix_bottom =
837
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_bottom);
838
+
839
+ sparse_matrix_top.SpMM_bias(x, bias, &out_view_top);
840
+ sparse_matrix_bottom.SpMM_bias(x, bias, &out_view_bottom);
841
+
842
+ CheckResult(out, out_view, kCols);
843
+
844
+ // Check that we can use a span as an input vector. Multiply upper left
845
+ // portion of the matrix by the top half of the vector.
846
+ // ---------------
847
+ // |oooooo | x q
848
+ // |oooooo | x q
849
+ // | | =
850
+ // | |
851
+ // ---------------
852
+
853
+ auto masked_matrix_quarter = MaskedSparseMatrix<float>(
854
+ kRows / 2, kCols / 2, mask.data(), values.data());
855
+ auto sparse_matrix_quarter =
856
+ CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_quarter);
857
+
858
+ MutableVectorView<float> x_top(&x, 0, kCols / 2);
859
+ FatCacheAlignedVector<float> out_correct(kRows / 2, /*cols=*/2);
860
+
861
+ for (int i = 0; i < kFatness * (kRows / 2); ++i) out_correct[i] = 256.f;
862
+
863
+ MutableVectorView<float> bias_top(&bias, 0, kRows / 2);
864
+ FatCacheAlignedVector<float> out_quarter(kRows / 2, kFatness);
865
+
866
+ sparse_matrix_quarter.SpMM_bias(x_top, bias_top, &out_quarter);
867
+
868
+ CheckResult(out_correct, out_quarter, kCols / 2);
869
+ }
870
+
871
+ namespace {
872
+
873
+ bool skip_test(const absl::Status& status, absl::string_view msg) {
874
+ if (!status.ok()) {
875
+ LOG(INFO) << "Couldn't load " << msg << ", skipping test " << status;
876
+ return true;
877
+ }
878
+
879
+ return false;
880
+ }
881
+
882
+ } // namespace
883
+
884
+ TEST(CsrBlockSparseMatrix, ModelMatrices_Bfloat16) {
885
+ std::vector<std::string> names = {
886
+ "768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
887
+ "768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
888
+ "768_512_95_4x4_finelogit_", "lyra_conv1d_"};
889
+ const std::string kPath =
890
+ #if defined __arm__ || defined __aarch64__
891
+ "/data/local/tmp/";
892
+ #else
893
+ (ghc::filesystem::current_path() / kTestdataPath).string();
894
+ #endif
895
+ for (auto& layer_name : names) {
896
+ SparseLinearLayer<bfloat16, float> sparse_linear_layer;
897
+ auto status = LoadSparseLayer<bfloat16, float>(layer_name, /*zipped=*/true,
898
+ &sparse_linear_layer, kPath);
899
+ // If the files don't exist on the device we're running on, just skip this
900
+ // test and log that it was skipped.
901
+ if (skip_test(status, layer_name)) return;
902
+
903
+ int rows = sparse_linear_layer.rows();
904
+ int cols = sparse_linear_layer.cols();
905
+
906
+ MaskedLinearLayer<float> masked_linear_layer;
907
+ status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
908
+ &masked_linear_layer, kPath);
909
+ if (skip_test(status, layer_name)) return;
910
+ masked_linear_layer.CastWeights<csrblocksparse::bfloat16>();
911
+
912
+ CacheAlignedVector<float> rhs(cols);
913
+ CacheAlignedVector<float> out_ref(rows);
914
+ CacheAlignedVector<float> out_spmv(rows);
915
+
916
+ rhs.FillRandom();
917
+ out_ref.FillZero();
918
+ out_spmv.FillZero();
919
+
920
+ std::array<bool, 2> use_relus = {false, true};
921
+ for (bool use_relu : use_relus) {
922
+ masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
923
+ sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
924
+
925
+ CheckResult(out_ref, out_spmv, cols);
926
+ }
927
+ }
928
+ }
929
+
930
+ TEST(CsrBlockSparseMatrix, ModelMatrices_float) {
931
+ std::vector<std::string> names = {
932
+ "768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
933
+ "768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
934
+ "768_512_95_4x4_finelogit_", "lyra_conv1d_"};
935
+ const std::string kPath =
936
+ #if defined __arm__ || defined __aarch64__
937
+ "/data/local/tmp/";
938
+ #else
939
+ (ghc::filesystem::current_path() / kTestdataPath).string();
940
+ #endif
941
+ for (auto& layer_name : names) {
942
+ SparseLinearLayer<float, float> sparse_linear_layer;
943
+ auto status = LoadSparseLayer<float, float>(layer_name, /*zipped=*/true,
944
+ &sparse_linear_layer, kPath);
945
+ // If the files don't exist on the device we're running on, just skip this
946
+ // test and log that it was skipped.
947
+ if (skip_test(status, layer_name)) return;
948
+
949
+ int rows = sparse_linear_layer.rows();
950
+ int cols = sparse_linear_layer.cols();
951
+
952
+ MaskedLinearLayer<float> masked_linear_layer;
953
+ status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
954
+ &masked_linear_layer, kPath);
955
+ if (skip_test(status, layer_name)) return;
956
+
957
+ CacheAlignedVector<float> rhs(cols);
958
+ CacheAlignedVector<float> out_ref(rows);
959
+ CacheAlignedVector<float> out_spmv(rows);
960
+
961
+ rhs.FillRandom();
962
+ out_ref.FillZero();
963
+ out_spmv.FillZero();
964
+
965
+ std::array<bool, 2> use_relus = {false, true};
966
+ for (bool use_relu : use_relus) {
967
+ masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
968
+ sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
969
+
970
+ CheckResult(out_ref, out_spmv, cols);
971
+ }
972
+ }
973
+ }
974
+
975
+ #undef SKIP_TEST
976
+
977
+ } // namespace csrblocksparse
sparse_matmul/layers/errno_mapping.cc ADDED
@@ -0,0 +1,195 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/layers/errno_mapping.h"
16
+
17
+ #include <cerrno>
+ #include <cstring>
+ #include <string>
18
+
19
+ #include "absl/strings/str_cat.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ namespace {
24
+
25
+ absl::StatusCode ErrnoToCode(int error_number) {
26
+ switch (error_number) {
27
+ case 0:
28
+ return absl::StatusCode::kOk;
29
+ case EINVAL: // Invalid argument
30
+ case ENAMETOOLONG: // Filename too long
31
+ case E2BIG: // Argument list too long
32
+ case EDESTADDRREQ: // Destination address required
33
+ case EDOM: // Mathematics argument out of domain of function
34
+ case EFAULT: // Bad address
35
+ case EILSEQ: // Illegal byte sequence
36
+ case ENOPROTOOPT: // Protocol not available
37
+ case ENOSTR: // Not a STREAM
38
+ case ENOTSOCK: // Not a socket
39
+ case ENOTTY: // Inappropriate I/O control operation
40
+ case EPROTOTYPE: // Protocol wrong type for socket
41
+ case ESPIPE: // Invalid seek
42
+ return absl::StatusCode::kInvalidArgument;
43
+ case ETIMEDOUT: // Connection timed out
44
+ case ETIME: // Timer expired
45
+ return absl::StatusCode::kDeadlineExceeded;
46
+ case ENODEV: // No such device
47
+ case ENOENT: // No such file or directory
48
+ #ifdef ENOMEDIUM
49
+ case ENOMEDIUM: // No medium found
50
+ #endif
51
+ case ENXIO: // No such device or address
52
+ case ESRCH: // No such process
53
+ return absl::StatusCode::kNotFound;
54
+ case EEXIST: // File exists
55
+ case EADDRNOTAVAIL: // Address not available
56
+ case EALREADY: // Connection already in progress
57
+ #ifdef ENOTUNIQ
58
+ case ENOTUNIQ: // Name not unique on network
59
+ #endif
60
+ return absl::StatusCode::kAlreadyExists;
61
+ case EPERM: // Operation not permitted
62
+ case EACCES: // Permission denied
63
+ #ifdef ENOKEY
64
+ case ENOKEY: // Required key not available
65
+ #endif
66
+ case EROFS: // Read only file system
67
+ return absl::StatusCode::kPermissionDenied;
68
+ case ENOTEMPTY: // Directory not empty
69
+ case EISDIR: // Is a directory
70
+ case ENOTDIR: // Not a directory
71
+ case EADDRINUSE: // Address already in use
72
+ case EBADF: // Invalid file descriptor
73
+ #ifdef EBADFD
74
+ case EBADFD: // File descriptor in bad state
75
+ #endif
76
+ case EBUSY: // Device or resource busy
77
+ case ECHILD: // No child processes
78
+ case EISCONN: // Socket is connected
79
+ #ifdef EISNAM
80
+ case EISNAM: // Is a named type file
81
+ #endif
82
+ #ifdef ENOTBLK
83
+ case ENOTBLK: // Block device required
84
+ #endif
85
+ case ENOTCONN: // The socket is not connected
86
+ case EPIPE: // Broken pipe
87
+ #ifdef ESHUTDOWN
88
+ case ESHUTDOWN: // Cannot send after transport endpoint shutdown
89
+ #endif
90
+ case ETXTBSY: // Text file busy
91
+ #ifdef EUNATCH
92
+ case EUNATCH: // Protocol driver not attached
93
+ #endif
94
+ return absl::StatusCode::kFailedPrecondition;
95
+ case ENOSPC: // No space left on device
96
+ #ifdef EDQUOT
97
+ case EDQUOT: // Disk quota exceeded
98
+ #endif
99
+ case EMFILE: // Too many open files
100
+ case EMLINK: // Too many links
101
+ case ENFILE: // Too many open files in system
102
+ case ENOBUFS: // No buffer space available
103
+ case ENODATA: // No message is available on the STREAM read queue
104
+ case ENOMEM: // Not enough space
105
+ case ENOSR: // No STREAM resources
106
+ #ifdef EUSERS
107
+ case EUSERS: // Too many users
108
+ #endif
109
+ return absl::StatusCode::kResourceExhausted;
110
+ #ifdef ECHRNG
111
+ case ECHRNG: // Channel number out of range
112
+ #endif
113
+ case EFBIG: // File too large
114
+ case EOVERFLOW: // Value too large to be stored in data type
115
+ case ERANGE: // Result too large
116
+ return absl::StatusCode::kOutOfRange;
117
+ #ifdef ENOPKG
118
+ case ENOPKG: // Package not installed
119
+ #endif
120
+ case ENOSYS: // Function not implemented
121
+ case ENOTSUP: // Operation not supported
122
+ case EAFNOSUPPORT: // Address family not supported
123
+ #ifdef EPFNOSUPPORT
124
+ case EPFNOSUPPORT: // Protocol family not supported
125
+ #endif
126
+ case EPROTONOSUPPORT: // Protocol not supported
127
+ #ifdef ESOCKTNOSUPPORT
128
+ case ESOCKTNOSUPPORT: // Socket type not supported
129
+ #endif
130
+ case EXDEV: // Improper link
131
+ return absl::StatusCode::kUnimplemented;
132
+ case EAGAIN: // Resource temporarily unavailable
133
+ #ifdef ECOMM
134
+ case ECOMM: // Communication error on send
135
+ #endif
136
+ case ECONNREFUSED: // Connection refused
137
+ case ECONNABORTED: // Connection aborted
138
+ case ECONNRESET: // Connection reset
139
+ case EINTR: // Interrupted function call
140
+ #ifdef EHOSTDOWN
141
+ case EHOSTDOWN: // Host is down
142
+ #endif
143
+ case EHOSTUNREACH: // Host is unreachable
144
+ case ENETDOWN: // Network is down
145
+ case ENETRESET: // Connection aborted by network
146
+ case ENETUNREACH: // Network unreachable
147
+ case ENOLCK: // No locks available
148
+ case ENOLINK: // Link has been severed
149
+ #ifdef ENONET
150
+ case ENONET: // Machine is not on the network
151
+ #endif
152
+ return absl::StatusCode::kUnavailable;
153
+ case EDEADLK: // Resource deadlock avoided
154
+ #ifdef ESTALE
155
+ case ESTALE: // Stale file handle
156
+ #endif
157
+ return absl::StatusCode::kAborted;
158
+ case ECANCELED: // Operation cancelled
159
+ return absl::StatusCode::kCancelled;
160
+ default:
161
+ return absl::StatusCode::kUnknown;
162
+ }
163
+ }
164
+
165
+ // POSIX `strerror_r()` returns `int`.
166
+ ABSL_ATTRIBUTE_UNUSED std::string StrErrorResult(int result, const char* buffer,
167
+ int error_code) {
168
+ if (ABSL_PREDICT_FALSE(result != 0)) {
169
+ return absl::StrCat("Unknown error ", error_code);
170
+ }
171
+ return buffer;
172
+ }
173
+
174
+ // GNU `strerror_r()` returns `char*`.
175
+ ABSL_ATTRIBUTE_UNUSED std::string StrErrorResult(char* result,
176
+ const char* buffer,
177
+ int error_code) {
178
+ return result;
179
+ }
180
+
181
+ std::string StrError(int error_code) {
182
+ char message[256];
183
+ return StrErrorResult(strerror_r(error_code, message, sizeof(message)),
184
+ message, error_code);
185
+ }
186
+
187
+ } // namespace
188
+
189
+ absl::Status ErrnoToCanonicalStatus(int error_number,
190
+ absl::string_view message) {
191
+ return absl::Status(ErrnoToCode(error_number),
192
+ absl::StrCat(message, ": ", StrError(error_number)));
193
+ }
194
+
195
+ } // namespace csrblocksparse
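A minimal usage sketch for ErrnoToCanonicalStatus follows; the OpenWeights wrapper and the file it opens are hypothetical and only illustrate how a failing C call plus errno is turned into an absl::Status.

// Sketch: wrap a failing C library call into an absl::Status. OpenWeights and
// the path it receives are hypothetical, for illustration only.
#include <cerrno>
#include <cstdio>

#include "absl/status/status.h"
#include "sparse_matmul/layers/errno_mapping.h"

absl::Status OpenWeights(const char* path) {
  std::FILE* file = std::fopen(path, "rb");
  if (file == nullptr) {
    // errno was set by fopen(); attach a canonical code and readable message.
    return csrblocksparse::ErrnoToCanonicalStatus(errno, "fopen failed");
  }
  std::fclose(file);
  return absl::OkStatus();
}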
sparse_matmul/layers/errno_mapping.h ADDED
@@ -0,0 +1,29 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
16
+ #define THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
17
+
18
+ #include "absl/status/status.h"
19
+ #include "absl/strings/string_view.h"
20
+
21
+ namespace csrblocksparse {
22
+
23
+ // Converts |error_number| value to absl::Status.
24
+ absl::Status ErrnoToCanonicalStatus(int error_number,
25
+ absl::string_view message);
26
+
27
+ } // namespace csrblocksparse
28
+
29
+ #endif // THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_ERRNO_MAPPING_H_
sparse_matmul/layers/masked_sparse_matrix.h ADDED
@@ -0,0 +1,206 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
19
+
20
+ #include <algorithm>
21
+ #include <cstdio>
22
+ #include <numeric>
+ #include <random>
23
+ #include <vector>
24
+
25
+ #include "absl/strings/str_format.h"
26
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
27
+
28
+ namespace csrblocksparse {
29
+
30
+ // MaskedSparseMatrix serves two purposes:
31
+ // 1) It is useful as a reference implementation of SpMV for correctness
32
+ // checking the much more complicated implementations in CsrBlockSparseMatrix
33
+ // 2) This is the format that sparse matrices are represented after pruning
34
+ // in TF. This class provides a bridge to getting these parameters into
35
+ // a compressed form suitable for computation and serialization.
36
+ //
37
+ // MaskedSparseMatrix<float> matrix(rows, cols, mask_from_tf, values_from_tf);
38
+ // CsrBlockSparseMatrix<float, bfloat16, int16_t> csr_matrix(matrix);
39
+ // csr_matrix.Multiply(rhs, bias, &out);
40
+ template <typename T>
41
+ class MaskedSparseMatrix {
42
+ public:
43
+ MaskedSparseMatrix() {}
44
+
45
+ // Construct a MaskedSparseMatrix of the given size, sparsity and block size.
46
+ // This is mainly useful for testing.
47
+ MaskedSparseMatrix(int rows, int cols, float sparsity, int block_height = 1,
48
+ int block_width = 1, float constant = 1.f,
49
+ bool random = true)
50
+ : rows_(rows), cols_(cols), sparsity_(sparsity) {
51
+ CHECK_EQ(rows % block_height, 0);
52
+ CHECK_EQ(cols % block_width, 0);
53
+
54
+ init(sparsity, block_height, block_width, constant, random);
55
+ }
56
+
57
+ // Construct from an existing mask and values (most likely from a TF model).
58
+ template <typename MaskType>
59
+ MaskedSparseMatrix(int rows, int cols, const MaskType* mask, const T* values)
60
+ : rows_(rows), cols_(cols) {
61
+ mask_.resize(rows * cols);
62
+ values_.resize(rows * cols);
63
+ std::copy_n(mask, rows * cols, mask_.begin());
64
+ std::copy_n(values, rows * cols, values_.begin());
65
+ sparsity_ =
66
+ 1.f - std::accumulate(mask_.begin(), mask_.end(), 0.f) / mask_.size();
67
+ }
68
+
69
+ const std::vector<int>& mask() const { return mask_; }
70
+ const std::vector<T>& values() const { return values_; }
71
+ T* data() { return values_.data(); }
72
+ const T* data() const { return values_.data(); }
73
+
74
+ int rows() const { return rows_; }
75
+ int cols() const { return cols_; }
76
+ float sparsity() const { return sparsity_; }
77
+
78
+ void Print() const {
79
+ absl::PrintF("-------Values---------\n");
80
+ for (int r = 0; r < rows_; ++r) {
81
+ for (int c = 0; c < cols_; ++c) {
82
+ absl::PrintF("%+6.3f ", static_cast<float>(values_[r * cols_ + c]));
83
+ }
84
+ absl::PrintF("\n");
85
+ }
86
+ absl::PrintF("-------Mask---------\n");
87
+ for (int r = 0; r < rows_; ++r) {
88
+ for (int c = 0; c < cols_; ++c) {
89
+ printf("%2d ", mask_[r * cols_ + c]);
90
+ }
91
+ absl::PrintF("\n");
92
+ }
93
+ }
94
+
95
+ // This routine is useful for rounding the possibly higher precision values
96
+ // stored in this class to a lower precision, so that correctness checks
97
+ // between this class and CSRBlockSparseMatrix can have a tighter tolerance.
98
+ template <typename U>
99
+ void CastWeights() {
100
+ for (int i = 0; i < values_.size(); ++i) {
101
+ values_[i] = static_cast<T>(U(values_[i]));
102
+ }
103
+ }
104
+
105
+ // Only meant for correctness checking.
106
+ // RhsClassType is meant to be either CacheAlignedVector OR
107
+ // FatCacheAlignedVector.
108
+ // The weight matrix is ROW MAJOR and RhsClassType is COLUMN MAJOR.
109
+ // |bias| is broadcast if |rhs| has more than one column.
110
+ template <typename RhsClassType, typename BiasType, typename OutClassType,
111
+ typename RhsType = typename RhsClassType::value_type,
112
+ typename OutType = typename OutClassType::value_type>
113
+ void SpMM_bias(const RhsClassType& rhs,
114
+ const CacheAlignedVector<BiasType>& bias, OutClassType* out,
115
+ bool relu = false) {
116
+ for (int r = 0; r < rows_; ++r) {
117
+ for (int n = 0; n < rhs.cols(); ++n) {
118
+ float sum = 0.f;
119
+ const RhsType* rhs_ptr = rhs.data() + n * rhs.rows();
120
+ OutType* out_ptr = out->data() + n * out->rows();
121
+ const int* mask_ptr = mask_.data() + r * cols_;
122
+ const T* value_ptr = values_.data() + r * cols_;
123
+ for (int c = 0; c < cols_; ++c) {
124
+ sum += mask_ptr[c] * static_cast<float>(value_ptr[c]) *
125
+ static_cast<float>(rhs_ptr[c]);
126
+ }
127
+ out_ptr[r] = static_cast<OutType>(
128
+ relu ? std::max(sum + static_cast<float>(bias[r]), 0.f)
129
+ : sum + static_cast<float>(bias[r]));
130
+ }
131
+ }
132
+ }
133
+
134
+ private:
135
+ // Generate a random matrix with the specified sparsity.
136
+ // Useful for testing.
137
+ void init(float sparsity, int block_height, int block_width, float constant,
138
+ bool random = true) {
139
+ int reduced_rows = rows_ / block_height;
140
+ int reduced_cols = cols_ / block_width;
141
+ mask_.resize(rows_ * cols_, 0);
142
+
143
+ // Fill with non-zero value to make sure masking works.
144
+ values_.resize(rows_ * cols_, static_cast<T>(2.f));
145
+
146
+ std::mt19937 generator(0);
147
+ std::uniform_real_distribution<float> dist_sparsity;
148
+ std::uniform_real_distribution<float> dist_value(-1.f, 1.f);
149
+ int nnz = 0;
150
+ while (nnz == 0) {
151
+ for (int r = 0; r < reduced_rows; ++r) {
152
+ for (int c = 0; c < reduced_cols; ++c) {
153
+ if (dist_sparsity(generator) > sparsity) {
154
+ nnz++;
155
+ for (int i = 0; i < block_height; ++i) {
156
+ for (int j = 0; j < block_width; ++j) {
157
+ mask_[(r * block_height + i) * cols_ + block_width * c + j] = 1;
158
+ values_[(r * block_height + i) * cols_ + block_width * c + j] =
159
+ static_cast<T>(random ? dist_value(generator) : constant);
160
+ }
161
+ }
162
+ }
163
+ }
164
+ }
165
+ }
166
+ }
167
+
168
+ std::vector<int> mask_;
169
+ std::vector<T> values_;
170
+ int rows_;
171
+ int cols_;
172
+ float sparsity_;
173
+ };
174
+
175
+ template <typename T>
176
+ class MaskedLinearLayer {
177
+ public:
178
+ MaskedLinearLayer(MaskedSparseMatrix<T>&& weights,
179
+ CacheAlignedVector<T>&& bias)
180
+ : weights_(std::move(weights)), bias_(std::move(bias)) {}
181
+
182
+ MaskedLinearLayer() {}
183
+
184
+ template <typename U>
185
+ void CastWeights() {
186
+ weights_.template CastWeights<U>();
187
+ }
188
+
189
+ // Does Ax + b where A is a masked sparse ROW MAJOR matrix and
190
+ // x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
191
+ // broadcast is rhs has more than one column.
192
+ template <typename FatVector>
193
+ void SpMM_bias(const FatVector& rhs, FatVector* out, bool relu = false) {
194
+ static_assert(std::is_same<typename FatVector::value_type, T>::value,
195
+ "FatVector value_type must match masked_linear_layer type");
196
+ weights_.SpMM_bias(rhs, bias_, out, relu);
197
+ }
198
+
199
+ private:
200
+ MaskedSparseMatrix<T> weights_;
201
+ CacheAlignedVector<T> bias_;
202
+ };
203
+
204
+ } // namespace csrblocksparse
205
+
206
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_MASKED_SPARSE_MATRIX_H_
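The sketch below illustrates the bridge described in the MaskedSparseMatrix header comment, following the same call pattern as csrblocksparse_test.cc; the sizes, the alternating mask, and the bfloat16 weight type are assumptions made only for this example.

// Sketch only: convert a TF-style mask plus dense values into the compressed
// form and compare against the reference SpMM_bias. All sizes are arbitrary.
#include <vector>

#include "sparse_matmul/layers/csr_blocksparse_matrix.h"
#include "sparse_matmul/layers/masked_sparse_matrix.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"

void MaskedToCompressedExample() {
  const int kRows = 256, kCols = 256;
  std::vector<float> values(kRows * kCols, 1.f);
  std::vector<int> mask(kRows * kCols);
  for (int i = 0; i < kRows * kCols; ++i) mask[i] = i % 2;  // ~50% sparse

  csrblocksparse::MaskedSparseMatrix<float> masked(kRows, kCols, mask.data(),
                                                   values.data());
  csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float>
      compressed(masked);

  csrblocksparse::FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
  csrblocksparse::CacheAlignedVector<float> bias(kRows);
  csrblocksparse::FatCacheAlignedVector<float> out_ref(kRows, /*cols=*/1);
  csrblocksparse::FatCacheAlignedVector<float> out_csr(kRows, /*cols=*/1);
  rhs.FillOnes();
  bias.FillZero();

  masked.SpMM_bias(rhs, bias, &out_ref);      // reference implementation
  compressed.SpMM_bias(rhs, bias, &out_csr);  // compressed implementation
}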
sparse_matmul/layers/read_array_ifstream.h ADDED
@@ -0,0 +1,66 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ // Low-level array reading function using std::ifstream.
18
+
19
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
20
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
21
+
22
+ #include <cstdint>
23
+ #include <fstream>
24
+ #include <sstream>
25
+ #include <string>
26
+
27
+ #include "absl/status/status.h"
28
+ #include "absl/strings/substitute.h"
+ #include "glog/logging.h"
29
+ #include "include/ghc/filesystem.hpp"
30
+
31
+ namespace csrblocksparse {
32
+ namespace detail {
33
+
34
+ template <typename T>
35
+ absl::Status ReadArrayIfstream(const std::string& file_name,
36
+ const std::string& path, std::vector<T>* array,
37
+ int64_t* length) {
38
+ ghc::filesystem::path complete_path(path);
39
+ complete_path /= file_name;
40
+ std::ifstream in_stream(complete_path.u8string(), std::ios::binary);
41
+ if (!in_stream.is_open()) {
42
+ return absl::UnknownError(
43
+ absl::Substitute("Error opening $0", complete_path.string()));
44
+ }
45
+
46
+ std::stringstream buffer;
47
+ buffer << in_stream.rdbuf();
48
+ if (buffer.str().empty()) {
49
+ LOG(ERROR) << "File " << complete_path << " was empty.";
50
+ return absl::UnknownError(
51
+ absl::Substitute("File $0 was empty", complete_path.string()));
52
+ }
53
+ std::string contents = buffer.str();
54
+ *length = contents.length();
55
+ int64_t elem = (*length + sizeof(T) - 1) / sizeof(T);
56
+ array->resize(elem);
57
+ std::move(contents.begin(), contents.end(),
58
+ reinterpret_cast<char*>(array->data()));
59
+
60
+ return absl::OkStatus();
61
+ }
62
+
63
+ } // namespace detail
64
+ } // namespace csrblocksparse
65
+
66
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_READ_ARRAY_IFSTREAM_H_
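A hedged example of ReadArrayIfstream; the file name and directory below are placeholders, not files that ship with this source.

// Sketch: read a file of raw float32 bytes into a vector. "bias.raw" and
// "/tmp/model" are placeholder names.
#include <cstdint>
#include <vector>

#include "absl/status/status.h"
#include "sparse_matmul/layers/read_array_ifstream.h"

void ReadArrayExample() {
  std::vector<float> data;
  int64_t num_bytes = 0;
  const absl::Status status = csrblocksparse::detail::ReadArrayIfstream(
      "bias.raw", "/tmp/model", &data, &num_bytes);
  if (!status.ok()) return;
  // data.size() is num_bytes rounded up to a whole number of floats.
}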
sparse_matmul/layers/sparse_linear_layer.h ADDED
@@ -0,0 +1,365 @@
1
+ /*
2
+ * Copyright 2021 Google LLC
3
+ *
4
+ * Licensed under the Apache License, Version 2.0 (the "License");
5
+ * you may not use this file except in compliance with the License.
6
+ * You may obtain a copy of the License at
7
+ *
8
+ * http://www.apache.org/licenses/LICENSE-2.0
9
+ *
10
+ * Unless required by applicable law or agreed to in writing, software
11
+ * distributed under the License is distributed on an "AS IS" BASIS,
12
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ * See the License for the specific language governing permissions and
14
+ * limitations under the License.
15
+ */
16
+
17
+ #ifndef LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
18
+ #define LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
19
+
20
+ #include <cstdint>
21
+
22
+ #include "absl/memory/memory.h"
23
+ #include "glog/logging.h"
24
+ #include "sparse_matmul/layers/csr_blocksparse_matrix.h"
25
+ #include "sparse_matmul/layers/masked_sparse_matrix.h"
26
+ #include "sparse_matmul/numerics/type_utils.h"
27
+ #include "sparse_matmul/os/coop_threads.h"
28
+ #include "sparse_matmul/vector/cache_aligned_vector.h"
29
+
30
+ namespace csrblocksparse {
31
+
32
+ template <typename WeightType, typename RhsType,
33
+ typename BiasType = typename TypeOfProduct<WeightType, RhsType>::type,
34
+ typename DeltaType = int16_t>
35
+ class SparseLinearLayer {
36
+ public:
37
+ SparseLinearLayer() {}
38
+
39
+ SparseLinearLayer(CsrBlockSparseMatrix<WeightType, RhsType>&& sparse_matrix,
40
+ CacheAlignedVector<BiasType>&& bias)
41
+ : sparse_matrix_(std::move(sparse_matrix)), full_bias_(std::move(bias)) {
42
+ CHECK_EQ(sparse_matrix_.rows(), full_bias_.size());
43
+ // Some kernels expect that the bias is divided by 4, so we store a second
44
+ // copy of a quarter of the bias.
45
+ // TODO(b/189958858): Remove the quartered bias if it can be done without
46
+ // loss of speed, and rename the |full_bias_| member back to |bias_|.
47
+ bias_ = full_bias_;
48
+ for (int i = 0; i < bias_.size(); ++i) {
49
+ bias_[i] = static_cast<BiasType>(.25f * static_cast<float>(bias_[i]));
50
+ }
51
+ }
52
+ SparseLinearLayer(
53
+ const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
54
+ *this = src;
55
+ }
56
+ SparseLinearLayer& operator=(
57
+ const SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>& src) {
58
+ sparse_matrix_ = src.sparse_matrix_;
59
+ bias_ = src.bias_;
60
+ full_bias_ = src.full_bias_;
61
+ mid_output_ = src.mid_output_;
62
+ thread_layers_ = src.thread_layers_;
63
+ num_threads_ = src.num_threads_;
64
+ if (src.split_pc_) {
65
+ split_pc_ = absl::make_unique<ProducerConsumer>(
66
+ src.split_pc_->num_producers(), src.split_pc_->num_consumers());
67
+ }
68
+ return *this;
69
+ }
70
+
71
+ // Does Ax + b where A is a block sparse compressed sparse row matrix and
72
+ // x is a COLUMN MAJOR dense vector or matrix. Bias is a vector that is
73
+ // broadcast if rhs has more than one column.
74
+ template <typename RhsClassType, typename OutType>
75
+ void SpMM_bias(const RhsClassType& rhs, OutType* out, bool relu = false,
76
+ int tid = 0, SpinBarrier* barrier = nullptr) const {
77
+ static_assert(
78
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
79
+ sparse_matrix_.SpMM_bias(rhs, bias_, out, relu, tid, barrier);
80
+ }
81
+ // Multiplies a sparse matrix by a possibly dense matrix, as SpMM_bias above,
82
+ // and then samples from the output (softmax distribution) layer.
83
+ template <typename RhsClassType, typename OutType>
84
+ int SpMM_bias_Sample(const RhsClassType& rhs, OutType* out, float temperature,
85
+ int tid, SpinBarrier* barrier, std::minstd_rand* gen,
86
+ CacheAlignedVector<float>* scratch) const {
87
+ static_assert(
88
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
89
+ return sparse_matrix_.SpMM_bias_Sample(rhs, bias_, out, temperature, tid,
90
+ barrier, gen, scratch);
91
+ }
92
+ template <typename RhsClassType, typename OutType>
93
+ void MatVec(const RhsClassType& rhs, bool relu, int tid, int replicas,
94
+ int output_stride, OutType* output,
95
+ SpinBarrier* barrier = nullptr) {
96
+ static_assert(
97
+ std::is_same<typename RhsClassType::value_type, RhsType>::value, "");
98
+ #ifdef __AVX2__
99
+ if (block_width() == 4 && (block_height() == 4 || block_height() == 8) &&
100
+ !IsCustomFloatType<WeightType>::value) {
101
+ if (!IsSplit()) {
102
+ sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu,
103
+ tid, replicas, output_stride, output->data());
104
+ if (barrier != nullptr) barrier->barrier();
105
+ return;
106
+ }
107
+ // NOTE: Until the quartered bias is removed it is a bad idea to split
108
+ // for ARM in the same way, as we would have to quarter the output of
109
+ // the first part of the split before running the second part.
110
+ // Signal completion of the previous MatVec.
111
+ split_pc_->produce();
112
+ PartLinearLayer& thread_part = thread_layers_[tid];
113
+ auto offset_output =
114
+ sparse_matrix_.thread_bounds().OffsetOutput(output->data(), tid);
115
+ auto mid_output =
116
+ sparse_matrix_.thread_bounds().OffsetOutput(mid_output_.data(), tid);
117
+ auto offset_bias = sparse_matrix_.thread_bounds().OffsetOutput(
118
+ mid_output_.cast_data(), tid);
119
+ // We can continue to consume the data that this thread produced and
120
+ // compute just the |self_matrix| part.
121
+ // No |relu| or |replicas|, as this is only a partial matmul.
122
+ // |tid| is always zero because the matrix has been split by tid.
123
+ thread_part.self_matrix.MatVec(
124
+ rhs.cast_data(), thread_part.full_bias.cast_data(), /*relu=*/false,
125
+ /*tid=*/0, /*replicas=*/1, output_stride, mid_output);
126
+ // We have to wait for the other threads to finish working on the previous
127
+ // MatMul before consuming the rest of |rhs|.
128
+ split_pc_->consume();
129
+ thread_part.other_matrix.MatVec(rhs.cast_data(), offset_bias, relu,
130
+ /*tid=*/0, replicas, output_stride,
131
+ offset_output);
132
+ return;
133
+ }
134
+ #endif
135
+ DCHECK_EQ(replicas, 1) << "Must have single replica for SpMM API";
136
+ if (IsSplit()) {
137
+ // Generics aren't setup to use a split matrix. This will be inefficient.
138
+ split_pc_->produce();
139
+ split_pc_->consume();
140
+ }
141
+ if (block_height() == 8) {
142
+ // We are currently forced to use MatVec generics for this case.
143
+ LOG(WARNING) << "Need to implement MatVec for 8x4 for non-AVX2 targets!!";
144
+ sparse_matrix_.MatVec(rhs.cast_data(), full_bias_.cast_data(), relu, tid,
145
+ replicas, output_stride, output->data());
146
+ if (barrier != nullptr) barrier->barrier();
147
+ } else {
148
+ sparse_matrix_.SpMM_bias(rhs, bias_, output, relu, tid, barrier);
149
+ }
150
+ }
151
+
152
+ int rows() const { return sparse_matrix_.rows(); }
153
+ int cols() const { return sparse_matrix_.cols(); }
154
+ float sparsity() const { return sparse_matrix_.sparsity(); }
155
+ int block_width() const { return sparse_matrix_.block_width(); }
156
+ int block_height() const { return sparse_matrix_.block_height(); }
157
+ int num_threads() const { return sparse_matrix_.num_threads(); }
158
+ const CacheAlignedVector<BiasType>& bias() const { return bias_; }
159
+ const std::vector<int>& split_points() const {
160
+ return sparse_matrix_.split_points();
161
+ }
162
+ bool IsSplit() const {
163
+ return !thread_layers_.empty() && split_pc_ != nullptr;
164
+ }
165
+
166
+ std::size_t bytes() const { return sparse_matrix_.bytes() + bias_.bytes(); }
167
+ void Print() const {
168
+ printf("Matrix\n");
169
+ sparse_matrix_.Print();
170
+ printf("Bias\n");
171
+ bias_.Print();
172
+ }
173
+
174
+ // Combines adjacent row blocks, doubling the block height.
175
+ // This necessarily involves adding zero weights where the blocks don't align
176
+ // across adjacent pairs of rows, so use with caution, as the resulting matrix
177
+ // is most likely to run slower if very sparse to begin with.
178
+ // In the few cases where the blocks do mostly align, the resulting matmul
179
+ // could be much faster, as the number of reads of the rhs will be halved.
180
+ void DoubleBlockHeight() { sparse_matrix_.DoubleBlockHeight(); }
181
+
182
+ // Cache_line_size is provided only for testing. Normally uses a value for
183
+ // the current architecture.
184
+ int PrepareForThreads(int num_threads, int cache_line_size = -1) {
185
+ num_threads_ = num_threads;
186
+ if (num_threads_ > 1) {
187
+ split_pc_ =
188
+ absl::make_unique<ProducerConsumer>(num_threads_, num_threads_);
189
+ } else {
190
+ split_pc_.reset(nullptr);
191
+ }
192
+ return sparse_matrix_.PrepareForThreads(num_threads, cache_line_size);
193
+ }
194
+
195
+ // Partitions the matrix into pieces by thread.
196
+ // In this matrix, we can go ahead and calculate the part that only depends
197
+ // on rhs inputs that were generated by this thread in the previous matvec,
198
+ // without having to use any thread synchronization, and only after that do we
199
+ // have to wait for the other threads to finish the previous matvec.
200
+ // So we split the matrix using the |split_points| from the previous matrix
201
+ // into 2 * |num_threads_| pieces: self and other for each thread, being the
202
+ // parts that can be calculated before and after the other threads have
203
+ // completed their calculation of the previous matvec.
204
+ // We then have to use a ProducerConsumer lock instead of a SpinBarrier to
205
+ // synchronize the data produced by the other threads.
206
+ void SliceForThreads(const std::vector<int>& split_points) {
207
+ thread_layers_.clear();
208
+ thread_layers_.reserve(num_threads_);
209
+ LOG(INFO) << "Slicing " << rows() << "x" << cols() << " matrix for "
210
+ << num_threads_ << " threads";
211
+ for (int tid = 0; tid < num_threads_; ++tid) {
212
+ thread_layers_.emplace_back(
213
+ sparse_matrix_, full_bias_, bias_, tid,
214
+ split_points[tid] * sparse_matrix_.block_height(),
215
+ split_points[tid + 1] * sparse_matrix_.block_height());
216
+ }
217
+ mid_output_ =
218
+ std::move(csrblocksparse::CacheAlignedVector<BiasType>(rows()));
219
+ mid_output_.FillZero();
220
+ }
221
+
222
+ // Splits the layer by inputs into 2 equal pieces. Each of the resulting
223
+ // layers should be computed independently on the first and second halves of
224
+ // the inputs respectively and the results added to achieve the same effect
225
+ // as the original layer.
226
+ void SplitInputs(
227
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
228
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
229
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
230
+ sparse_matrix_.SplitByColumn(0, sparse_matrix_.cols() / 2));
231
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix2(
232
+ sparse_matrix_.SplitByColumn(sparse_matrix_.cols() / 2,
233
+ sparse_matrix_.cols()));
234
+ *part1 =
235
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
236
+ std::move(matrix1),
237
+ std::move(CacheAlignedVector<BiasType>(full_bias_))));
238
+ CacheAlignedVector<BiasType> bias2(sparse_matrix_.rows());
239
+ bias2.FillZero();
240
+ *part2 =
241
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
242
+ std::move(matrix2), std::move(bias2)));
243
+ }
244
+
245
+ // Splits the layer by outputs into 2 equal pieces. Each of the resulting
246
+ // layers should be computed independently on the full inputs and the results
247
+ // concatenated to achieve the same effect as the original layer.
248
+ void SplitOutputs(
249
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part1,
250
+ SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>* part2) {
251
+ LOG(INFO) << "input rows=" << sparse_matrix_.rows()
252
+ << ", cols=" << sparse_matrix_.cols();
253
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix1(
254
+ sparse_matrix_.SplitByRow(0, sparse_matrix_.rows() / 2));
255
+ CsrBlockSparseMatrix<WeightType, RhsType> matrix2(sparse_matrix_.SplitByRow(
256
+ sparse_matrix_.rows() / 2, sparse_matrix_.rows()));
257
+ CacheAlignedVector<BiasType> bias1(full_bias_, 0, full_bias_.size() / 2);
258
+ *part1 =
259
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
260
+ std::move(matrix1), std::move(bias1)));
261
+ CacheAlignedVector<BiasType> bias2(full_bias_, full_bias_.size() / 2,
262
+ full_bias_.size());
263
+ *part2 =
264
+ std::move(SparseLinearLayer<WeightType, RhsType, BiasType, DeltaType>(
265
+ std::move(matrix2), std::move(bias2)));
266
+ }
267
+
268
+ private:
269
+ // Simple struct to hold a partitioned layer.
270
+ struct PartLinearLayer {
271
+ // The original matrix is first split by row to generate only the outputs
272
+ // for the given tid. The |row_sub_matrix| is then split by column into two
273
+ // partitions:
274
+ // self is the part for which the rhs elements in [|start_col|, |end_col|)
275
+ // were generated by this thread in some previous matmul.
276
+ // |other| is the rest of the columns that require rhs elements from other
277
+ // threads.
278
+ // NOTE that |start_col|, |end_col| are in raw columns, not blocks.
279
+ PartLinearLayer(const CsrBlockSparseMatrix<WeightType, RhsType>& matrix,
280
+ const CacheAlignedVector<BiasType>& bias,
281
+ const CacheAlignedVector<BiasType>& bias_4, int tid,
282
+ int start_col, int end_col) {
283
+ int block_height = matrix.block_height();
284
+ // Split the input matrix by row, selecting only the rows relevant to
285
+ // thread tid.
286
+ int start_row = matrix.split_points()[tid] * block_height;
287
+ int end_row = matrix.split_points()[tid + 1] * block_height;
288
+ LOG(INFO) << "input cols [" << start_col << "," << end_col << ") rows ["
289
+ << start_row << "," << end_row << ")";
290
+ CsrBlockSparseMatrix<WeightType, RhsType> row_sub_matrix =
291
+ matrix.SplitByRow(start_row, end_row);
292
+ // Partition into the columns that use rhs elements that thread tid
293
+ // produced in a previous matmul, and the other rhs elements.
294
+ // NOTE that we |keep_rhs_size|=true so that each matrix can operate on
295
+ // the same rhs input vector. The self matrix just guarantees not to
296
+ // access any of the elements that are generated by another thread.
297
+ self_matrix = std::move(row_sub_matrix.SplitByColumn(
298
+ start_col, end_col, /*keep_rhs_size=*/true));
299
+ self_matrix.PrepareForThreads(1);
300
+ // The reversed start and end slice out the complement of [start, end).
301
+ other_matrix = std::move(row_sub_matrix.SplitByColumn(
302
+ end_col, start_col, /*keep_rhs_size=*/true));
303
+ other_matrix.PrepareForThreads(1);
304
+ full_bias =
305
+ std::move(CacheAlignedVector<BiasType>(bias, start_row, end_row));
306
+ // TODO(b/189958858): Eliminate the quarter bias from all the code.
307
+ quarter_bias =
308
+ std::move(CacheAlignedVector<BiasType>(bias_4, start_row, end_row));
309
+ }
310
+ // The part of the matrix that only depends on this thread for rhs inputs.
311
+ CsrBlockSparseMatrix<WeightType, RhsType> self_matrix;
312
+ CacheAlignedVector<BiasType> full_bias;
313
+ CacheAlignedVector<BiasType> quarter_bias;
314
+ // The part of the matrix that uses rhs inputs from other threads.
315
+ CsrBlockSparseMatrix<WeightType, RhsType> other_matrix;
316
+ };
317
+ CsrBlockSparseMatrix<WeightType, RhsType, DeltaType> sparse_matrix_;
318
+ CacheAlignedVector<BiasType> bias_;
319
+ CacheAlignedVector<BiasType> full_bias_;
320
+ // Output from the self_matrix that will be given to |other_matrix| as bias.
321
+ CacheAlignedVector<BiasType> mid_output_;
322
+ // One partitioned pair of matrices for each thread.
323
+ std::vector<PartLinearLayer> thread_layers_;
324
+ // Producer-consumer lock used to wait between computing |self_matrix| and
325
+ // |other_matrix| for the other threads to finish the *previous* matvec.
326
+ std::unique_ptr<ProducerConsumer> split_pc_;
327
+ int num_threads_ = 0;
328
+ };
329
+
330
+ template <typename WeightType, typename RhsType>
331
+ SparseLinearLayer<WeightType, RhsType> CreateRandomLayer(int rows, int cols,
332
+ float sparsity,
333
+ int block_height = 1,
334
+ int block_width = 1) {
335
+ typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
336
+ CacheAlignedVector<BiasType> bias(rows);
337
+ bias.FillRandom();
338
+
339
+ auto masked_matrix = MaskedSparseMatrix<float>(rows, cols, sparsity,
340
+ block_height, block_width);
341
+ auto sparse_matrix = CsrBlockSparseMatrix<WeightType, RhsType>(masked_matrix);
342
+
343
+ return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
344
+ std::move(bias));
345
+ }
346
+
347
+ template <typename WeightType, typename RhsType>
348
+ SparseLinearLayer<WeightType, RhsType> CreateConstantLayer(
349
+ int rows, int cols, float sparsity, float constant = 1.f) {
350
+ typedef typename TypeOfProduct<WeightType, RhsType>::type BiasType;
351
+ CacheAlignedVector<BiasType> bias(rows);
352
+ bias.FillOnes();
353
+
354
+ MaskedSparseMatrix<float> masked_matrix(rows, cols, sparsity,
355
+ /*block_height=*/1, /*block_width=*/1,
356
+ constant, /*random=*/false);
357
+ CsrBlockSparseMatrix<WeightType, RhsType> sparse_matrix(masked_matrix);
358
+
359
+ return SparseLinearLayer<WeightType, RhsType>(std::move(sparse_matrix),
360
+ std::move(bias));
361
+ }
362
+
363
+ } // namespace csrblocksparse
364
+
365
+ #endif // LYRA_CODEC_SPARSE_MATMUL_LAYERS_SPARSE_LINEAR_LAYER_H_
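A minimal sketch of driving a SparseLinearLayer single-threaded via the CreateRandomLayer helper above; the dimensions, sparsity, and block shape are arbitrary example values.

// Sketch: single-threaded SpMM_bias on a randomly generated layer.
#include "sparse_matmul/layers/sparse_linear_layer.h"
#include "sparse_matmul/vector/cache_aligned_vector.h"

void SparseLayerExample() {
  auto layer = csrblocksparse::CreateRandomLayer<float, float>(
      /*rows=*/256, /*cols=*/256, /*sparsity=*/0.9f,
      /*block_height=*/4, /*block_width=*/4);
  layer.PrepareForThreads(1);  // the tests always prepare before computing

  csrblocksparse::FatCacheAlignedVector<float> rhs(layer.cols(), /*cols=*/1);
  csrblocksparse::FatCacheAlignedVector<float> out(layer.rows(), /*cols=*/1);
  rhs.FillRandom();
  out.FillZero();
  layer.SpMM_bias(rhs, &out, /*relu=*/true, /*tid=*/0);
}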
sparse_matmul/layers/sparse_linear_layer_test.cc ADDED
@@ -0,0 +1,187 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #include "sparse_matmul/layers/sparse_linear_layer.h"
16
+
17
+ #include "gmock/gmock.h"
18
+ #include "gtest/gtest.h"
19
+ #include "sparse_matmul/numerics/test_utils.h"
20
+
21
+ namespace csrblocksparse {
22
+ namespace {
23
+
24
+ constexpr int kBlockSize = 4;
25
+ constexpr int kSize = 256;
26
+ constexpr int kNumThreads = 4;
27
+ constexpr int kCols = 1;
28
+
29
+ void SlicedThreadBody(SpinBarrier* spin_barrier, int tid,
30
+ const FatCacheAlignedVector<float>& rhs,
31
+ SparseLinearLayer<float, float>* sparse_linear_layer,
32
+ FatCacheAlignedVector<float>* out, bool use_relu) {
33
+ sparse_linear_layer->MatVec(rhs, use_relu, tid, /*replicas=*/1,
34
+ /*output_stride=*/0, out);
35
+ spin_barrier->barrier();
36
+ }
37
+
38
+ // Tests that a Layer that has been SliceForThreads computes the same result as
39
+ // the original layer. This is a basic test that all the slicing didn't mess up
40
+ // any of the computations.
41
+ TEST(CsrBlockSparseMatrix, SliceForThreads) {
42
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
43
+ FatCacheAlignedVector<float> rhs(kSize, kCols);
44
+ CacheAlignedVector<float> bias(kSize);
45
+ FatCacheAlignedVector<float> out1(kSize, kCols);
46
+
47
+ bias.FillRandom();
48
+ rhs.FillRandom();
49
+ out1.FillZero();
50
+ FatCacheAlignedVector<float> out_reference = out1;
51
+ CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
52
+ SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
53
+ std::move(bias));
54
+ sparse_linear_layer.PrepareForThreads(1);
55
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
56
+ /*output_stride=*/0, &out_reference);
57
+ std::vector<int> fake_split_points = {0, 48 / kBlockSize, 128 / kBlockSize,
58
+ 208 / kBlockSize, kSize / kBlockSize};
59
+ sparse_linear_layer.PrepareForThreads(kNumThreads);
60
+ sparse_linear_layer.SliceForThreads(fake_split_points);
61
+ csrblocksparse::LaunchOnThreadsWithBarrier(kNumThreads, SlicedThreadBody, rhs,
62
+ &sparse_linear_layer, &out1,
63
+ /*relu=*/true);
64
+
65
+ CheckResult(out_reference, out1, kCols);
66
+ }
67
+
68
+ void LayersThreadBody(SpinBarrier* spin_barrier, int tid,
69
+ const FatCacheAlignedVector<float>& rhs,
70
+ SparseLinearLayer<float, float>* sparse_linear_layer1,
71
+ SparseLinearLayer<float, float>* sparse_linear_layer2,
72
+ FatCacheAlignedVector<float>* out1,
73
+ FatCacheAlignedVector<float>* out2, bool use_relu) {
74
+ sparse_linear_layer1->MatVec(rhs, use_relu, tid, /*replicas=*/1,
75
+ /*output_stride=*/0, out1);
76
+ // NOTE no barrier here!
77
+ sparse_linear_layer2->MatVec(*out1, use_relu, tid, /*replicas=*/1,
78
+ /*output_stride=*/0, out2);
79
+ spin_barrier->barrier();
80
+ }
81
+
82
+ // Tests that a pair of layers computes the same result whether or not the
83
+ // second layer has been SliceForThreads. This is a more critical test that
84
+ // the replacement of barriers with producer-consumer locks works.
85
+ // Must be run with tsan to really test it properly.
86
+ TEST(CsrBlockSparseMatrix, SliceForThreadsLayers) {
87
+ MaskedSparseMatrix<float> matrix1(kSize, kSize, 0.95, kBlockSize, kBlockSize);
88
+ FatCacheAlignedVector<float> rhs(kSize, kCols);
89
+ CacheAlignedVector<float> bias1(kSize);
90
+ FatCacheAlignedVector<float> out1(kSize, kCols);
91
+ MaskedSparseMatrix<float> matrix2(kSize, kSize, 0.95, kBlockSize, kBlockSize);
92
+ CacheAlignedVector<float> bias2(kSize);
93
+ FatCacheAlignedVector<float> out2(kSize, kCols);
94
+
95
+ bias1.FillRandom();
96
+ rhs.FillRandom();
97
+ bias2.FillRandom();
98
+ out1.FillZero();
99
+ out2.FillZero();
100
+ FatCacheAlignedVector<float> out_reference = out2;
101
+ CsrBlockSparseMatrix<float, float> sparse_matrix1(matrix1);
102
+ SparseLinearLayer<float, float> layer1(std::move(sparse_matrix1),
103
+ std::move(bias1));
104
+ CsrBlockSparseMatrix<float, float> sparse_matrix2(matrix2);
105
+ SparseLinearLayer<float, float> layer2(std::move(sparse_matrix2),
106
+ std::move(bias2));
107
+ layer1.PrepareForThreads(1);
108
+ layer2.PrepareForThreads(1);
109
+ layer1.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
110
+ /*output_stride=*/0, &out1);
111
+ layer2.MatVec(out1, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
112
+ /*output_stride=*/0, &out_reference);
113
+ layer1.PrepareForThreads(kNumThreads);
114
+ layer2.PrepareForThreads(kNumThreads);
115
+ layer2.SliceForThreads(layer1.split_points());
116
+ csrblocksparse::LaunchOnThreadsWithBarrier(kNumThreads, LayersThreadBody, rhs,
117
+ &layer1, &layer2, &out1, &out2,
118
+ /*relu=*/true);
119
+
120
+ CheckResult(out_reference, out2, kCols);
121
+ }
122
+
123
+ // Tests that a Layer that has been DoubleBlockHeight()-ed computes the same
124
+ // result as the original layer. (Float compute type).
125
+ TEST(CsrBlockSparseMatrix, Float8x4) {
126
+ using ComputeType = float;
127
+ using RhsType = float;
128
+ using BiasType = float;
129
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
130
+ matrix.CastWeights<ComputeType>();
131
+ FatCacheAlignedVector<RhsType> rhs(kSize, kCols);
132
+ CacheAlignedVector<BiasType> bias(kSize);
133
+ FatCacheAlignedVector<BiasType> out1(kSize, kCols);
134
+
135
+ bias.FillRandom();
136
+ rhs.FillRandom();
137
+ out1.FillZero();
138
+ FatCacheAlignedVector<BiasType> out_reference = out1;
139
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
140
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
141
+ std::move(sparse_matrix), std::move(bias));
142
+ sparse_linear_layer.PrepareForThreads(1);
143
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
144
+ /*output_stride=*/0, &out_reference);
145
+ sparse_linear_layer.DoubleBlockHeight();
146
+ sparse_linear_layer.PrepareForThreads(1);
147
+ sparse_linear_layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
148
+ /*output_stride=*/0, &out1);
149
+ CheckResult(out_reference, out1, kCols);
150
+ }
151
+
152
+ // Tests that a Layer that has been DoubleBlockHeight()-ed computes the same
153
+ // result as the original layer. (Fixed16 compute type).
154
+ TEST(CsrBlockSparseMatrix, Fixed8x4) {
155
+ using ComputeType = csrblocksparse::fixed16<4>;
156
+ using RhsType = csrblocksparse::fixed16<4>;
157
+ using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
158
+ MaskedSparseMatrix<float> matrix(kSize, kSize, 0.95, kBlockSize, kBlockSize);
159
+ matrix.CastWeights<ComputeType>();
160
+ FatCacheAlignedVector<RhsType> rhs(kSize, kCols);
161
+ CacheAlignedVector<BiasType> bias(kSize);
162
+ FatCacheAlignedVector<BiasType> out1(kSize, kCols);
163
+
164
+ bias.FillRandom();
165
+ rhs.FillRandom();
166
+ out1.FillZero();
167
+ FatCacheAlignedVector<BiasType> out_reference = out1;
168
+ CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
169
+ SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
170
+ std::move(sparse_matrix), std::move(bias));
171
+ sparse_linear_layer.PrepareForThreads(1);
172
+ sparse_linear_layer.MatVec(rhs, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
173
+ /*output_stride=*/0, &out_reference);
174
+ sparse_linear_layer.DoubleBlockHeight();
175
+ sparse_linear_layer.PrepareForThreads(1);
176
+ sparse_linear_layer.MatVec(rhs, /*relu=*/false, /*tid=*/0, /*replicas=*/1,
177
+ /*output_stride=*/0, &out1);
178
+ CheckResult(out_reference, out1, kCols);
179
+ }
180
+
181
+ TEST(SparseLinearLayerTest, PrintCompiles) {
182
+ SparseLinearLayer<float, float> sparse_linear_layer;
183
+ sparse_linear_layer.Print();
184
+ }
185
+
186
+ } // namespace
187
+ } // namespace csrblocksparse
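The tests above exercise the SparseLinearLayer workflow end to end: build a MaskedSparseMatrix, convert it to a CsrBlockSparseMatrix, wrap it in a SparseLinearLayer together with a bias vector, call PrepareForThreads(), then MatVec(). A minimal single-threaded sketch of that workflow, mirroring the Float8x4 test, is shown below; the include paths, the helper name RunSparseLayerOnce, and the dimensions are illustrative assumptions rather than part of this upload.
// Sketch only: single-threaded use of SparseLinearLayer, mirroring the
// Float8x4 test above. Include paths and sizes are assumptions.
#include <utility>

#include "sparse_matmul/layers/masked_sparse_matrix.h"   // assumed path
#include "sparse_matmul/layers/sparse_linear_layer.h"    // assumed path
#include "sparse_matmul/vector/cache_aligned_vector.h"   // assumed path

void RunSparseLayerOnce() {
  constexpr int kRows = 256;   // illustrative layer size
  constexpr int kCols = 1;     // number of right-hand-side columns
  constexpr int kBlock = 4;    // 4x4 block-sparse structure
  // 95% sparse random weight matrix with 4x4 blocks, as in the tests above.
  csrblocksparse::MaskedSparseMatrix<float> masked(kRows, kRows, 0.95, kBlock,
                                                   kBlock);
  csrblocksparse::FatCacheAlignedVector<float> rhs(kRows, kCols);
  csrblocksparse::CacheAlignedVector<float> bias(kRows);
  csrblocksparse::FatCacheAlignedVector<float> out(kRows, kCols);
  bias.FillRandom();
  rhs.FillRandom();
  out.FillZero();
  // Convert to the CSR block-sparse representation and wrap it in a layer.
  csrblocksparse::CsrBlockSparseMatrix<float, float> sparse(masked);
  csrblocksparse::SparseLinearLayer<float, float> layer(std::move(sparse),
                                                        std::move(bias));
  layer.PrepareForThreads(1);
  // Computes out = relu(weights * rhs + bias) on one thread, one replica.
  layer.MatVec(rhs, /*relu=*/true, /*tid=*/0, /*replicas=*/1,
               /*output_stride=*/0, &out);
}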
sparse_matmul/layers/status_macros.h ADDED
@@ -0,0 +1,34 @@
1
+ // Copyright 2021 Google LLC
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
16
+ #define THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
17
+
18
+ #include "absl/status/status.h"
19
+ #include "absl/status/statusor.h"
20
+
21
+ #define SPARSE_MATMUL_RETURN_IF_ERROR(expr) \
22
+ do { \
23
+ const absl::Status _status = (expr); \
24
+ if (!_status.ok()) return _status; \
25
+ } while (0)
26
+ template <typename T>
27
+ absl::Status DoAssignOrReturn(T& lhs, absl::StatusOr<T> result) {
28
+ if (result.ok()) {
29
+ lhs = result.value();
30
+ }
31
+ return result.status();
32
+ }
33
+
34
+ #endif // THIRD_PARTY_LYRA_CODEC_SPARSE_MATMUL_LAYERS_STATUS_MACROS_H_
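SPARSE_MATMUL_RETURN_IF_ERROR propagates any non-OK absl::Status to the caller, and DoAssignOrReturn copies the value out of an absl::StatusOr<T> into an existing variable while surfacing its status. A hypothetical usage sketch follows, assuming this header is included; LoadBias() and LoadWeightCount() are illustrative names, not functions from this upload.
// Sketch only: typical use of the status helpers above.
absl::Status LoadBias();                // hypothetical dependency
absl::StatusOr<int> LoadWeightCount();  // hypothetical dependency

absl::Status LoadAll() {
  // Returns early with the error if LoadBias() fails.
  SPARSE_MATMUL_RETURN_IF_ERROR(LoadBias());
  int weight_count = 0;
  // Assigns on success; otherwise returns the non-OK status.
  SPARSE_MATMUL_RETURN_IF_ERROR(
      DoAssignOrReturn(weight_count, LoadWeightCount()));
  return absl::OkStatus();
}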
sparse_matmul/layers/testdata/768_512_95_4x4_QRhat_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f861af29b1f767830d74ef83874944b18d80157b6b0256fdc4c14fa79ec936
3
+ size 20852
sparse_matmul/layers/testdata/768_512_95_4x4_What_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2d534bde2caf6e59990a46b4b1907088b8144c53d62d97de7e2b4bdc956da68
3
+ size 5133
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11399f9d0e8f8dfbef6eb37e0c096f858658bc650f728a08f3135ccca44f0a5a
3
+ size 1062
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3d971e067a6df985d68beac26bcf4e9a6cc13ff328599e84d50a0fc9a7c103b
3
+ size 2382
sparse_matmul/layers/testdata/768_512_95_4x4_coarselogit_weights.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1376ef7a360699dae24a49f40a254990d4a70b844dadcdbe9dcbf1a306999a8
3
+ size 55829
sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_bias.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffcc8ccf086fccfacc928877aa29ef03ce51cce0f0b7d2aacf81782b7b527089
3
+ size 2003
sparse_matmul/layers/testdata/768_512_95_4x4_coarseproj_mask.raw.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a16f98ba6f09031ea9fefb79fdc9ba90e44f0046ab70dab014ac971ca7f7186
3
+ size 4684