From 2452cae043c9a4377f86852f510e0a7d3dc3d75d Mon Sep 17 00:00:00 2001
From: Fynn Petersen-Frey
Date: Thu, 30 Nov 2023 23:26:08 +0100
Subject: [PATCH] feat(ml): ARMNN acceleration

---
 docker/mlaccel-armnn.yml                   |  11 ++
 machine-learning/Dockerfile                |  26 ++-
 machine-learning/ann/ann.cpp               | 196 +++++++++++++++++++++
 machine-learning/ann/ann.py                | 124 +++++++++++++
 machine-learning/ann/build.sh              |   1 +
 machine-learning/export/.gitignore         |   1 +
 machine-learning/export/build-converter.sh |   4 +
 machine-learning/export/download-armnn.sh  |   8 +
 machine-learning/export/env.yaml           |   1 +
 machine-learning/export/tiny.py            |  80 +++++++++
 10 files changed, 450 insertions(+), 2 deletions(-)
 create mode 100644 docker/mlaccel-armnn.yml
 create mode 100644 machine-learning/ann/ann.cpp
 create mode 100644 machine-learning/ann/ann.py
 create mode 100644 machine-learning/ann/build.sh
 create mode 100644 machine-learning/export/.gitignore
 create mode 100755 machine-learning/export/build-converter.sh
 create mode 100755 machine-learning/export/download-armnn.sh
 create mode 100644 machine-learning/export/tiny.py

diff --git a/docker/mlaccel-armnn.yml b/docker/mlaccel-armnn.yml
new file mode 100644
index 000000000..3be6487db
--- /dev/null
+++ b/docker/mlaccel-armnn.yml
@@ -0,0 +1,11 @@
+version: "3.8"
+
+# ML acceleration on supported Mali ARM GPUs using ARM-NN
+
+services:
+  mlaccel:
+    devices:
+      - /dev/mali0:/dev/mali0
+    volumes:
+      - /lib/firmware/mali_csffw.bin:/lib/firmware/mali_csffw.bin:ro # Mali firmware for your chipset
+      - /usr/lib/libmali-valhall-g610-g6p0-gbm.so:/usr/lib/libmali.so:ro # Mali driver for your chipset
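Note: the override file above is meant to be layered onto the service's main compose file, e.g. `docker compose -f docker-compose.yml -f docker/mlaccel-armnn.yml up -d` (the base compose file name is an assumption). The device node (`/dev/mali0`), firmware blob, and driver path (`libmali-valhall-g610-g6p0-gbm.so`) vary per chipset, as the inline comments indicate.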
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile
index 8d7f400d9..843d1d681 100644
--- a/machine-learning/Dockerfile
+++ b/machine-learning/Dockerfile
@@ -13,20 +13,42 @@ ENV VIRTUAL_ENV="/opt/venv" PATH="/opt/venv/bin:${PATH}"
 COPY poetry.lock pyproject.toml ./
 RUN poetry install --sync --no-interaction --no-ansi --no-root --only main
 
-FROM python:3.11-slim-bookworm@sha256:1bc6a3e9356d64ea632791653bc71a56340e8741dab66434ab2739ebf6aed29d
+ARG TARGETPLATFORM
+ENV ARMNN_PATH=/opt/armnn
+COPY ann /opt/ann
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+    mkdir /opt/armnn && \
+    curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-aarch64.tar.gz" | tar -zx -C /opt/armnn && \
+    cd /opt/ann && \
+    sh build.sh; \
+    fi
+
+FROM python:3.11-slim-bookworm@sha256:1bc6a3e9356d64ea632791653bc71a56340e8741dab66434ab2739ebf6aed29d
+ARG TARGETPLATFORM
 
 RUN apt-get update && apt-get install -y --no-install-recommends tini libmimalloc2.0 && rm -rf /var/lib/apt/lists/*
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+    apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir --parents /etc/OpenCL/vendors && \
+    echo "/usr/lib/libmali.so" > /etc/OpenCL/vendors/mali.icd && \
+    mkdir /opt/armnn && \
+    mkdir /opt/ann; \
+    fi
+
 WORKDIR /usr/src/app
 
 ENV NODE_ENV=production \
     TRANSFORMERS_CACHE=/cache \
     PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
     PATH="/opt/venv/bin:$PATH" \
-    PYTHONPATH=/usr/src
+    PYTHONPATH=/usr/src \
+    LD_LIBRARY_PATH=/opt/armnn
 
 COPY --from=builder /opt/venv /opt/venv
+COPY --from=builder /opt/armnn/libarmnn.so.?? /opt/armnn/libarmnnOnnxParser.so.?? /opt/armnn/libarmnnDeserializer.so.?? /opt/armnn/libarmnnTfLiteParser.so.?? /opt/armnn/libprotobuf.so.?.??.?.? /opt/ann/libann.s[o] /opt/armnn/
+COPY ann/ann.py /usr/src/ann/ann.py
 COPY start.sh log_conf.json ./
 COPY app .
 
 ENTRYPOINT ["tini", "--"]
 CMD ["./start.sh"]
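Since the ARM-NN libraries are only present in arm64 images (the `?`/`[o]` globs let the COPY succeed even when the builder stage produced nothing), the service needs a way to decide at runtime whether the accelerated path is usable. A minimal probe could look like the sketch below — a hypothetical helper, not part of this diff; it only relies on `libann.so` being resolvable through the `LD_LIBRARY_PATH=/opt/armnn` set above.

```python
from ctypes import CDLL


def ann_available() -> bool:
    """Return True if the ARM-NN wrapper library can be loaded."""
    try:
        # Resolved via LD_LIBRARY_PATH=/opt/armnn in the runtime image.
        CDLL("libann.so")
        return True
    except OSError:
        return False
```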
diff --git a/machine-learning/ann/ann.cpp b/machine-learning/ann/ann.cpp
new file mode 100644
index 000000000..40b1b9193
--- /dev/null
+++ b/machine-learning/ann/ann.cpp
@@ -0,0 +1,196 @@
+#include <fstream>
+
+#include "armnn/IRuntime.hpp"
+#include "armnn/INetwork.hpp"
+#include "armnn/Types.hpp"
+#include "armnnDeserializer/IDeserializer.hpp"
+#include "armnnTfLiteParser/ITfLiteParser.hpp"
+#include "armnnOnnxParser/IOnnxParser.hpp"
+
+using namespace armnn;
+
+class Ann
+{
+
+public:
+    int load(const char *modelPath, const char *inputName, const char *outputName, bool fastMath, bool saveCachedNetwork, const char *cachedNetworkPath)
+    {
+        BindingPointInfo inputInfo;
+        BindingPointInfo outputInfo;
+        INetworkPtr network = loadModel(modelPath, inputName, outputName, inputInfo, outputInfo);
+
+        auto n = network.get();
+
+        IOptimizedNetworkPtr optNet = OptimizeNetwork(n, fastMath, saveCachedNetwork, cachedNetworkPath);
+        NetworkId netId;
+        Status status = runtime->LoadNetwork(netId, std::move(optNet));
+        inputInfos[netId] = inputInfo;
+        outputInfos[netId] = outputInfo;
+        return netId;
+    }
+
+    void embed(NetworkId netId, const void *inputData, void *outputData)
+    {
+        const BindingPointInfo *inputInfo = &inputInfos[netId];
+        const BindingPointInfo *outputInfo = &outputInfos[netId];
+        InputTensors inputTensors = {{inputInfo->first, ConstTensor{inputInfo->second, inputData}}};
+        OutputTensors outputTensors = {{outputInfo->first, armnn::Tensor{outputInfo->second, outputData}}};
+        runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+    }
+
+    void unload(NetworkId netId)
+    {
+        runtime->UnloadNetwork(netId);
+    }
+
+    unsigned long shape(NetworkId netId, bool isInput)
+    {
+        const TensorShape shape = (isInput ? inputInfos : outputInfos)[netId].second.GetShape();
+        unsigned long s = 0;
+        for (unsigned int d = 0; d < shape.GetNumDimensions(); d++)
+            s |= ((unsigned long)shape[d]) << (d * 16); // stores up to 4 16-bit values in a 64-bit value
+        return s;
+    }
+
+    Ann(int tuningLevel, const char *tuningFile)
+    {
+        IRuntime::CreationOptions runtimeOptions;
+        BackendOptions backendOptions{"GpuAcc",
+                                      {
+                                          {"TuningLevel", tuningLevel},
+                                          {"MemoryOptimizerStrategy", "ConstantMemoryStrategy"}, // SingleAxisPriorityList or ConstantMemoryStrategy
+                                      }};
+        if (tuningFile)
+            backendOptions.AddOption({"TuningFile", tuningFile});
+        runtimeOptions.m_BackendOptions.emplace_back(backendOptions);
+        runtime = IRuntime::CreateRaw(runtimeOptions);
+    };
+    ~Ann()
+    {
+        IRuntime::Destroy(runtime);
+    };
+
+private:
+    INetworkPtr loadModel(const char *modelPath, const char *inputName, const char *outputName, BindingPointInfo &inputInfo, BindingPointInfo &outputInfo)
+    {
+        const auto path = std::string(modelPath);
+        if (path.rfind(".tflite") == path.length() - 7) // endsWith()
+        {
+            auto parser = armnnTfLiteParser::ITfLiteParser::CreateRaw();
+            INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath);
+            auto inputBinding = parser->GetNetworkInputBindingInfo(0, inputName);
+            inputInfo = getInputTensorInfo(inputBinding.first, inputBinding.second);
+            outputInfo = parser->GetNetworkOutputBindingInfo(0, outputName);
+            return network;
+        }
+        else if (path.rfind(".onnx") == path.length() - 5) // endsWith()
+        {
+            auto parser = armnnOnnxParser::IOnnxParser::CreateRaw();
+            INetworkPtr network = parser->CreateNetworkFromBinaryFile(modelPath);
+            auto inputBinding = parser->GetNetworkInputBindingInfo(inputName);
+            inputInfo = getInputTensorInfo(inputBinding.first, inputBinding.second);
+            outputInfo = parser->GetNetworkOutputBindingInfo(outputName);
+            return network;
+        }
+        else
+        {
+            std::ifstream ifs(path, std::ifstream::in | std::ifstream::binary);
+            auto parser = armnnDeserializer::IDeserializer::CreateRaw();
+            INetworkPtr network = parser->CreateNetworkFromBinary(ifs);
+            auto inputBinding = parser->GetNetworkInputBindingInfo(0, inputName);
+            inputInfo = getInputTensorInfo(inputBinding.m_BindingId, inputBinding.m_TensorInfo);
+            auto outputBinding = parser->GetNetworkOutputBindingInfo(0, outputName);
+            outputInfo = {outputBinding.m_BindingId, outputBinding.m_TensorInfo};
+            return network;
+        }
+    }
+
+    BindingPointInfo getInputTensorInfo(LayerBindingId inputBindingId, TensorInfo &info)
+    {
+        const auto newInfo = TensorInfo{info.GetShape(), info.GetDataType(),
+                                        info.GetQuantizationScale(),
+                                        info.GetQuantizationOffset(),
+                                        true};
+        return {inputBindingId, newInfo};
+    }
+
+    IOptimizedNetworkPtr OptimizeNetwork(INetwork *network, bool fastMath, bool saveCachedNetwork, const char *cachedNetworkPath)
+    {
+        const bool allowExpandedDims = false;
+        const ShapeInferenceMethod shapeInferenceMethod = ShapeInferenceMethod::ValidateOnly;
+
+        OptimizerOptionsOpaque options;
+        options.SetReduceFp32ToFp16(false);
+        options.SetShapeInferenceMethod(shapeInferenceMethod);
+        options.SetAllowExpandedDims(allowExpandedDims);
+
+        BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}});
+        if (cachedNetworkPath)
+        {
+            gpuAcc.AddOption({"SaveCachedNetwork", saveCachedNetwork});
+            gpuAcc.AddOption({"CachedNetworkFilePath", cachedNetworkPath});
+        }
+        options.AddModelOption(gpuAcc);
+
+        // No point in using ARMNN for CPU, use ONNX instead.
+        // BackendOptions cpuAcc("CpuAcc",
+        //                       {
+        //                           {"FastMathEnabled", true},
+        //                           {"NumberOfThreads", 0},
+        //                       });
+        // options.AddModelOption(cpuAcc);
+
+        BackendOptions allowExDimOpt("AllowExpandedDims",
+                                     {{"AllowExpandedDims", allowExpandedDims}});
+        options.AddModelOption(allowExDimOpt);
+        BackendOptions shapeInferOpt("ShapeInferenceMethod",
+                                     {{"InferAndValidate", shapeInferenceMethod == ShapeInferenceMethod::InferAndValidate}});
+        options.AddModelOption(shapeInferOpt);
+
+        std::vector<BackendId> backends = {BackendId("GpuAcc")};
+        return Optimize(*network, backends, runtime->GetDeviceSpec(), options);
+    }
+
+    IRuntime *runtime;
+    std::map<NetworkId, BindingPointInfo> inputInfos;
+    std::map<NetworkId, BindingPointInfo> outputInfos;
+};
+
+extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile)
+{
+    LogSeverity level = static_cast<LogSeverity>(logLevel);
+    ConfigureLogging(true, true, level);
+
+    Ann *ann = new Ann(tuningLevel, tuningFile);
+    return ann;
+}
+
+extern "C" void destroy(void *ann)
+{
+    delete ((Ann *)ann);
+}
+
+extern "C" int load(void *ann,
+                    const char *path,
+                    const char *inputName,
+                    const char *outputName,
+                    bool fastMath,
+                    bool saveCachedNetwork,
+                    const char *cachedNetworkPath)
+{
+    return ((Ann *)ann)->load(path, inputName, outputName, fastMath, saveCachedNetwork, cachedNetworkPath);
+}
+
+extern "C" void unload(void *ann, NetworkId netId)
+{
+    ((Ann *)ann)->unload(netId);
+}
+
+extern "C" void embed(void *ann, NetworkId netId, void *inputData, void *outputData)
+{
+    ((Ann *)ann)->embed(netId, inputData, outputData);
+}
+
+extern "C" unsigned long shape(void *ann, NetworkId netId, bool isInput)
+{
+    return ((Ann *)ann)->shape(netId, isInput);
+}
\ No newline at end of file
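`Ann::shape()` packs up to four tensor dimensions into a single 64-bit return value, 16 bits per dimension with the first dimension in the lowest bits; `ann.py` below decodes it with a shift loop. A worked example of the round trip (plain Python, for illustration only):

```python
# Pack (1, 3, 224, 224) the way Ann::shape() does: 16 bits per dimension.
dims = (1, 3, 224, 224)
packed = 0
for i, d in enumerate(dims):
    packed |= d << (i * 16)

# Decode with the same shift loop ann.py uses.
decoded = []
while packed:
    decoded.append(packed & 0xFFFF)
    packed >>= 16
assert tuple(decoded) == dims  # dimensions of size 0 or >= 65536 would not survive
```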
diff --git a/machine-learning/ann/ann.py b/machine-learning/ann/ann.py
new file mode 100644
index 000000000..8238987b5
--- /dev/null
+++ b/machine-learning/ann/ann.py
@@ -0,0 +1,124 @@
+import time
+from ctypes import CDLL, c_bool, c_char_p, c_int, c_ulong, c_void_p
+from os.path import exists
+from typing import Dict, Tuple
+
+import numpy as np
+from numpy.typing import NDArray
+
+libann = CDLL("libann.so")
+libann.init.argtypes = c_int, c_int, c_char_p
+libann.init.restype = c_void_p
+libann.load.argtypes = c_void_p, c_char_p, c_char_p, c_char_p, c_bool, c_bool, c_char_p
+libann.load.restype = c_int
+libann.embed.argtypes = c_void_p, c_int, c_void_p, c_void_p
+libann.unload.argtypes = c_void_p, c_int
+libann.destroy.argtypes = (c_void_p,)
+libann.shape.argtypes = (c_void_p, c_int, c_bool)
+libann.shape.restype = c_ulong
+
+
+class Ann:
+    def __init__(self, log_level: int = 3, tuning_level: int = 1, tuning_file: str | None = None) -> None:
+        if tuning_file and not exists(tuning_file):
+            raise ValueError("tuning_file must point to an existing (possibly empty) file!")
+        if tuning_level == 0 and tuning_file is None:
+            raise ValueError("tuning_level == 0 reads existing tuning information and requires a tuning_file")
+        if tuning_level < 0 or tuning_level > 3:
+            raise ValueError("tuning_level must be 0 (load from tuning_file), 1, 2 or 3.")
+        if log_level < 0 or log_level > 5:
+            raise ValueError("log_level must be 0 (trace), 1 (debug), 2 (info), 3 (warning), 4 (error) or 5 (fatal)")
+        self.ann = libann.init(log_level, tuning_level, tuning_file.encode("utf-8") if tuning_file else None)
+        self.output_shapes: Dict[int, Tuple[int, ...]] = {}
+        self.input_shapes: Dict[int, Tuple[int, ...]] = {}
+
+    def __del__(self) -> None:
+        libann.destroy(self.ann)
+
+    def load(
+        self,
+        model_path: str,
+        input_name: str = "input_tensor",
+        output_name: str = "output_tensor",
+        fast_math: bool = True,
+        save_cached_network: bool = False,
+        cached_network_path: str | None = None,
+    ) -> int:
+        if not (exists(model_path) and model_path.endswith((".armnn", ".tflite", ".onnx"))):
+            raise ValueError("model_path must be a file with extension .armnn, .tflite or .onnx")
+        if cached_network_path and not exists(cached_network_path):
+            raise ValueError("cached_network_path must point to an existing (possibly empty) file!")
+        if save_cached_network and cached_network_path is None:
+            raise ValueError("if save_cached_network is True, cached_network_path must be specified!")
+        net_id = libann.load(
+            self.ann,
+            model_path.encode("utf-8"),
+            input_name.encode("utf-8"),
+            output_name.encode("utf-8"),
+            fast_math,
+            save_cached_network,
+            cached_network_path.encode("utf-8") if cached_network_path else None,
+        )
+
+        self.input_shapes[net_id] = self.shape(net_id, input=True)
+        self.output_shapes[net_id] = self.shape(net_id, input=False)
+        return net_id
+
+    def unload(self, network_id: int) -> None:
+        libann.unload(self.ann, network_id)
+        del self.input_shapes[network_id]
+        del self.output_shapes[network_id]
+
+    def embed(self, network_id: int, input_tensor: NDArray) -> NDArray:
+        net_input_shape = self.input_shapes[network_id]
+        if input_tensor.shape != net_input_shape:
+            raise ValueError(f"input_tensor shape {input_tensor.shape} != network input shape {net_input_shape}")
+        output_tensor = np.ndarray(self.output_shapes[network_id], dtype=np.float32)
+        libann.embed(
+            self.ann, network_id, input_tensor.ctypes.data_as(c_void_p), output_tensor.ctypes.data_as(c_void_p)
+        )
+        return output_tensor
+
+    def shape(self, network_id: int, input: bool = False) -> Tuple[int, ...]:
+        s = libann.shape(self.ann, network_id, input)
+        a = []
+        while s != 0:
+            a.append(s & 0xFFFF)
+            s >>= 16
+        return tuple(a)
+
+
+def test():
+    iterations = 1
+    start = time.perf_counter_ns()
+    ann = Ann(tuning_level=0, tuning_file="gpu.tuning")
+    net = ann.load("/tmp/tiny-clip-b1-fp16.armnn", save_cached_network=False, cached_network_path="cached.network")
+    end = time.perf_counter_ns()
+    # cached_network_path saves 1.2 seconds
+    print("loading took ", (end - start) / 1000000)
+    img = np.load("/tmp/img.npy")
+    # img = np.repeat(img, 2, 0)
+
+    start = time.perf_counter_ns()
+    # warmup
+    dummy = np.ndarray(ann.shape(net, input=True), dtype=np.float32)
+    ann.embed(net, dummy)
+    end = time.perf_counter_ns()
+    # tuning_file saves 18 seconds for tuning level 3
+    print("warmup took ", (end - start) / 1000000)
+
+    start = time.perf_counter_ns()
+    for i in range(iterations):
+        embedding = ann.embed(net, img)
+    end = time.perf_counter_ns()
+    per_sample = (end - start) / (1000000 * iterations)
+
+    # print(embedding)
+    # np.save("/tmp/ann_fp16.npy", embedding)
+    print("embedding took ", per_sample)
+
+    ann.unload(net)
+    del ann  # important to save tuning file
+
+
+if __name__ == "__main__":
+    test()
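`test()` above assumes `gpu.tuning` already holds tuning data, since `tuning_level=0` only reads it. The tuning data has to be produced once per GPU; a sketch of that one-time step under the constructor's rules (the file must already exist, and tuning data is flushed when the native runtime is destroyed — model path reused from `test()`):

```python
from ann.ann import Ann  # module layout as installed by the Dockerfile (PYTHONPATH=/usr/src)

# One-time GPU tuning pass. tuning_level=3 is the most exhaustive level;
# per the comments in test(), it adds ~18 s to warmup, which the file then amortizes.
open("gpu.tuning", "a").close()  # constructor requires an existing (possibly empty) file
ann = Ann(tuning_level=3, tuning_file="gpu.tuning")
net = ann.load("/tmp/tiny-clip-b1-fp16.armnn")
# ... run one embed() to exercise the kernels ...
ann.unload(net)
del ann  # tuning file is written on destruction, as test() notes
```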
diff --git a/machine-learning/ann/build.sh b/machine-learning/ann/build.sh
new file mode 100644
index 000000000..d90fa1ae1
--- /dev/null
+++ b/machine-learning/ann/build.sh
@@ -0,0 +1 @@
+g++ -shared -O3 -o libann.so -fuse-ld=gold -std=c++17 -I$ARMNN_PATH/include -L$ARMNN_PATH ann.cpp -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser
diff --git a/machine-learning/export/.gitignore b/machine-learning/export/.gitignore
new file mode 100644
index 000000000..19017c1a6
--- /dev/null
+++ b/machine-learning/export/.gitignore
@@ -0,0 +1 @@
+armnn*
diff --git a/machine-learning/export/build-converter.sh b/machine-learning/export/build-converter.sh
new file mode 100755
index 000000000..a727d791b
--- /dev/null
+++ b/machine-learning/export/build-converter.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+cd armnn-23.11/
+g++ -o ../armnnconverter -O1 -DARMNN_ONNX_PARSER -DARMNN_SERIALIZER -DARMNN_TF_LITE_PARSER -fuse-ld=gold -std=c++17 -Iinclude -Isrc/armnnUtils -Ithird-party -L../armnn src/armnnConverter/ArmnnConverter.cpp -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -larmnnSerializer
\ No newline at end of file
diff --git a/machine-learning/export/download-armnn.sh b/machine-learning/export/download-armnn.sh
new file mode 100755
index 000000000..886938ee7
--- /dev/null
+++ b/machine-learning/export/download-armnn.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# binaries
+mkdir armnn
+curl -SL "https://github.com/ARM-software/armnn/releases/download/v23.11/ArmNN-linux-x86_64.tar.gz" | tar -zx -C armnn
+
+# source to build ArmnnConverter
+curl -SL "https://github.com/ARM-software/armnn/archive/refs/tags/v23.11.tar.gz" | tar -zx
\ No newline at end of file
diff --git a/machine-learning/export/env.yaml b/machine-learning/export/env.yaml
index f7144812d..b4e313e36 100644
--- a/machine-learning/export/env.yaml
+++ b/machine-learning/export/env.yaml
@@ -22,4 +22,5 @@ dependencies:
   - pip:
       - multilingual-clip
       - onnx-simplifier
+      - git+https://github.com/fyfrey/TinyNeuralNetwork.git
 category: main
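Taken together, the export tooling is meant to run on an x86_64 machine (tiny.py below enforces this) in roughly this order: `./download-armnn.sh` to fetch the ARM-NN release binaries and source, `./build-converter.sh` to compile the `armnnconverter` CLI against them, then `python tiny.py` to trace the CLIP model, convert it to TFLite, and serialize it to `.armnn`. The resulting model file is what `ann.py` loads on the ARM device. (The download script fetches the x86_64 release tarball so the converter can link and run on the export machine; only the Dockerfile pulls the aarch64 build.)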
diff --git a/machine-learning/export/tiny.py b/machine-learning/export/tiny.py
new file mode 100644
index 000000000..04134562d
--- /dev/null
+++ b/machine-learning/export/tiny.py
@@ -0,0 +1,80 @@
+import logging
+import os
+import platform
+import subprocess
+
+import open_clip
+import torch
+from tinynn.converter import TFLiteConverter
+
+
+class Wrapper(torch.nn.Module):
+    def __init__(self, device: torch.device):
+        super().__init__()
+        self.device = device
+        self.model = open_clip.create_model(
+            "ViT-B-32",
+            pretrained="openai",
+            precision="fp16" if device.type == "cuda" else "fp32",
+            jit=False,
+            require_pretrained=True,
+            device=device,
+        )
+
+    def forward(self, input_tensor: torch.FloatTensor):
+        embedding = self.model.encode_image(input_tensor.half() if self.device.type == "cuda" else input_tensor)
+        return embedding.float()
+
+
+def main():
+    if platform.machine() not in ("x86_64", "AMD64"):
+        raise RuntimeError(f"Can only run on x86_64 / AMD64, not {platform.machine()}")
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if device.type != "cuda":
+        logging.warning(
+            "No CUDA available, cannot create fp16 model! Proceeding to create a fp32 model (use only for testing)"
+        )
+
+    model = Wrapper(device)
+    model = model.to(device)
+    for param in model.parameters():
+        param.requires_grad = False
+    model.eval()
+
+    dummy_input = torch.rand((1, 3, 224, 224))
+    dummy_input = dummy_input.to(device)
+
+    dummy_out = model(dummy_input)
+    print(dummy_out.dtype, dummy_out.device, dummy_out.shape)
+
+    jit = torch.jit.trace(model, dummy_input)
+    output_name = "output_tensor"
+    list(jit.graph.outputs())[0].setDebugName(output_name)
+    tflite_model_path = "tiny-clip.tflite"
+    output_path = os.path.join("out", tflite_model_path)
+
+    converter = TFLiteConverter(jit, dummy_input, output_path, nchw_transpose=True)
+    # segfaults on ARM, must run on x86_64 / AMD64
+    converter.convert()
+
+    armnn_model_path = "tiny-clip.armnn"
+    os.environ["LD_LIBRARY_PATH"] = "armnn"
+    subprocess.run(
+        [
+            "./armnnconverter",
+            "-f",
+            "tflite-binary",
+            "-m",
+            output_path,
+            "-i",
+            "input_tensor",
+            "-o",
+            output_name,
+            "-p",
+            armnn_model_path,
+        ]
+    )
+
+
+if __name__ == "__main__":
+    with torch.no_grad():
+        main()
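A possible on-device smoke test for the exported model, using the `Ann` wrapper from this patch. The model path and the expected 512-wide ViT-B-32 embedding are assumptions; since the converter ran with `nchw_transpose=True`, the input layout is read back from `input_shapes` rather than hard-coded:

```python
import numpy as np

from ann.ann import Ann  # module path as installed by the Dockerfile (PYTHONPATH=/usr/src)

ann = Ann(tuning_level=1)
net = ann.load("tiny-clip.armnn")  # hypothetical location of the exported model
x = np.random.rand(*ann.input_shapes[net]).astype(np.float32)
emb = ann.embed(net, x)
print(emb.shape)  # expected (1, 512) for ViT-B-32
ann.unload(net)
```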