From 8d7c827ef6a1d248fff425da2c209d92f04e888a Mon Sep 17 00:00:00 2001 From: Tobiszewski Date: Mon, 8 Jun 2026 13:15:37 +0200 Subject: [PATCH 1/3] Integrate GStreamer library for HW-accelerated audio decode/encode - Add src/mpi/: Intel MPI C API (intel_mpi.h, imp_mpi_impl.h, intel_mpi.cpp) and OVMS adapter layer (imp_audio_utils.hpp/cpp) - Add third_party/gstreamer/: Bazel BUILD for pre-installed GStreamer MSVC - windows_build.bat, windows_test.bat: add GStreamer bin dir to PATH The MPI library wraps GStreamer pipelines to provide HW-accelerated audio decode (readAudio) and encode (writeAudio) supporting WAV, MP3, FLAC, Opus, AAC and OGG. Upper layers call only through the MPI C API; GStreamer is an implementation detail hidden inside intel_mpi.cpp. On Linux the imp_audio_utils stubs return false, falling back to dr_libs. --- src/mpi/BUILD | 89 + src/mpi/imp_audio_utils.cpp | 230 +++ src/mpi/imp_audio_utils.hpp | 98 ++ src/mpi/imp_mpi_impl.h | 286 +++ src/mpi/intel_mpi.cpp | 1531 +++++++++++++++++ src/mpi/intel_mpi.h | 1013 +++++++++++ third_party/gstreamer/BUILD | 2 + third_party/gstreamer/gstreamer_windows.BUILD | 131 ++ windows_build.bat | 12 +- windows_test.bat | 2 +- 10 files changed, 3383 insertions(+), 11 deletions(-) create mode 100644 src/mpi/BUILD create mode 100644 src/mpi/imp_audio_utils.cpp create mode 100644 src/mpi/imp_audio_utils.hpp create mode 100644 src/mpi/imp_mpi_impl.h create mode 100644 src/mpi/intel_mpi.cpp create mode 100644 src/mpi/intel_mpi.h create mode 100644 third_party/gstreamer/BUILD create mode 100644 third_party/gstreamer/gstreamer_windows.BUILD diff --git a/src/mpi/BUILD b/src/mpi/BUILD new file mode 100644 index 0000000000..3239c5f622 --- /dev/null +++ b/src/mpi/BUILD @@ -0,0 +1,89 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +load("//:common_settings.bzl", "COMMON_LOCAL_DEFINES", "ovms_cc_library") + +package(default_visibility = ["//visibility:public"]) + +# Intel Media Processing Interface (Intel MPI) library +# Windows-only for now — wraps GStreamer for HW-accelerated media decode/encode +# with zero-copy OpenVINO tensor integration. +# +# On Linux this target is a no-op (empty lib) so dependents can +# unconditionally list it without breaking the build. + +cc_library( + name = "intel_mpi", + srcs = select({ + "//src:windows": ["intel_mpi.cpp"], + "//conditions:default": [], + }), + hdrs = select({ + "//src:windows": [ + "intel_mpi.h", + "imp_mpi_impl.h", + ], + "//conditions:default": ["intel_mpi.h"], + }), + copts = select({ + "//src:windows": [ + "/std:c++17", + "/EHsc", + "/MD", + "/DGST_USE_UNSTABLE_API", + "/external:anglebrackets", + "/external:W0", + "/wd4005", + "/wd4100", + "/wd4267", + "/wd4244", + "/wd4996", + ], + "//conditions:default": [], + }), + local_defines = COMMON_LOCAL_DEFINES + select({ + "//src:windows": ["INTEL_MPI_AVAILABLE=1"], + "//conditions:default": [], + }), + deps = select({ + "//src:windows": [ + "//third_party:openvino", + "//third_party:gstreamer", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) + +# Thin audio-decode/encode utilities that wrap Intel MPI for OVMS calculators. +# On Linux this is a stub (all functions return false → callers fall back to dr_libs). 
+ovms_cc_library( + name = "imp_audio_utils", + srcs = ["imp_audio_utils.cpp"], + hdrs = ["imp_audio_utils.hpp"], + local_defines = COMMON_LOCAL_DEFINES + select({ + "//src:windows": ["INTEL_MPI_AVAILABLE=1"], + "//conditions:default": [], + }), + deps = select({ + "//src:windows": [ + ":intel_mpi", + "//third_party:openvino", + ], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], +) diff --git a/src/mpi/imp_audio_utils.cpp b/src/mpi/imp_audio_utils.cpp new file mode 100644 index 0000000000..44e95e8b33 --- /dev/null +++ b/src/mpi/imp_audio_utils.cpp @@ -0,0 +1,230 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +/** + * @file imp_audio_utils.cpp + * @brief Implementation of Intel MPI audio helpers for OVMS. + * + * All media operations go through the Intel MPI C API (intel_mpi.h). + * This file does NOT call GStreamer directly — the MPI library owns + * that abstraction. + * + * Two compile-time paths: + * 1. INTEL_MPI_AVAILABLE — calls imp_decode_audio() etc. + * 2. 
Fallback — all functions return false / no-op + */ + +#include "imp_audio_utils.hpp" + +#ifdef INTEL_MPI_AVAILABLE + +#include "intel_mpi.h" // MPI C API + +#include // ov::Tensor for data extraction +#include +#include + +namespace ovms { +namespace imp { + +bool isAvailable() { + // imp_get_version succeeds when the library is linked in. + // A more thorough check could try creating a dummy context, + // but version presence is enough to confirm linkage. + int major = 0, minor = 0, patch = 0; + imp_get_version(&major, &minor, &patch); + return (major > 0 || minor > 0 || patch > 0); +} + +bool decodeAudioBuffer(const std::string_view& audioData, + uint32_t sampleRate, + std::vector& pcm) { + if (audioData.empty()) return false; + + // Configure decode options + imp_audio_decode_opts_t opts{}; + opts.sample_rate = sampleRate; + opts.channels = 1; // mono output for speech models + opts.output_type = IMP_TYPE_FP32; + opts.normalize = true; + + // Decode via Intel MPI (context = NULL → no GPU, pure media decode) + imp_tensor_t* tensor = nullptr; + imp_status_t status = imp_decode_audio( + &tensor, + audioData.data(), + audioData.size(), + nullptr, // ctx — not needed for CPU audio decode + &opts, + nullptr, // callback — synchronous + nullptr); // user_data + + if (status != IMP_OK || !tensor) { + if (tensor) imp_tensor_release(tensor); + return false; + } + + // Extract shape → number of float samples + int64_t dims[4] = {}; + size_t ndims = 4; + imp_tensor_get_shape(tensor, dims, &ndims); + + size_t numSamples = 1; + for (size_t i = 0; i < ndims; i++) { + numSamples *= static_cast(dims[i]); + } + + // Get pointer to underlying ov::Tensor data + void* ov_tensor_ptr = nullptr; + imp_device_type_t dev = IMP_DEVICE_CPU; + imp_tensor_get_ov(tensor, &ov_tensor_ptr, &dev); + + if (!ov_tensor_ptr || dev != IMP_DEVICE_CPU) { + imp_tensor_release(tensor); + return false; + } + + // The ov_tensor_ptr is an ov::Tensor* — extract float data + auto* ovt = 
reinterpret_cast(ov_tensor_ptr); + const float* fdata = ovt->data(); + + pcm.assign(fdata, fdata + numSamples); + + // Release the tensor through the MPI C API. + imp_tensor_release(tensor); + + return !pcm.empty(); +} + +bool encodeToWav(const std::vector& pcm, + uint32_t sampleRate, + uint16_t bitsPerSample, + std::vector& wavData) { + // WAV encoding is trivial — no need to route through GStreamer. + // Keep this self-contained for minimal overhead. + if (pcm.empty()) return false; + (void)bitsPerSample; // always output 32-bit float WAV + + uint32_t numChannels = 1; + uint32_t byteRate = sampleRate * numChannels * sizeof(float); + uint32_t blockAlign = numChannels * sizeof(float); + uint32_t dataSize = static_cast(pcm.size() * sizeof(float)); + uint32_t chunkSize = 36 + dataSize; + + wavData.clear(); + wavData.reserve(44 + dataSize); + + auto write16 = [&](uint16_t v) { wavData.push_back(v & 0xFF); wavData.push_back((v >> 8) & 0xFF); }; + auto write32 = [&](uint32_t v) { + wavData.push_back(v & 0xFF); wavData.push_back((v >> 8) & 0xFF); + wavData.push_back((v >> 16) & 0xFF); wavData.push_back((v >> 24) & 0xFF); + }; + auto writeStr = [&](const char* s, size_t n) { wavData.insert(wavData.end(), s, s + n); }; + + writeStr("RIFF", 4); + write32(chunkSize); + writeStr("WAVE", 4); + writeStr("fmt ", 4); + write32(16); // subchunk1 size + write16(3); // IEEE float format + write16(static_cast(numChannels)); + write32(sampleRate); + write32(byteRate); + write16(static_cast(blockAlign)); + write16(32); // bits per sample + writeStr("data", 4); + write32(dataSize); + + const uint8_t* raw = reinterpret_cast(pcm.data()); + wavData.insert(wavData.end(), raw, raw + dataSize); + + return true; +} + +bool encodeAudioBuffer(const float* samples, + size_t numSamples, + uint32_t sampleRate, + uint16_t bitsPerSample, + const std::string& format, + std::vector& output) { + if (!samples || numSamples == 0) return false; + + // For WAV we can use the local writer (fast, no GStreamer 
init) + if (format == "wav") { + std::vector pcm(samples, samples + numSamples); + return encodeToWav(pcm, sampleRate, bitsPerSample, output); + } + + // Route everything else through the MPI C API + imp_audio_encode_opts_t opts{}; + opts.codec = format.c_str(); + opts.sample_rate = sampleRate; + opts.channels = 1; + opts.bitrate_kbps = 192; // reasonable default for speech + opts.output_path = nullptr; + + void* data = nullptr; + size_t dataSize = 0; + imp_status_t st = imp_encode_audio(&data, &dataSize, samples, numSamples, &opts); + if (st != IMP_OK || !data) { + if (data) imp_free(data); + return false; + } + + output.assign(static_cast(data), + static_cast(data) + dataSize); + imp_free(data); + return true; +} + +} // namespace imp +} // namespace ovms + +#else // !INTEL_MPI_AVAILABLE + +// ---- Stub implementation (Linux / no GStreamer) ------------------------------ + +namespace ovms { +namespace imp { + +bool isAvailable() { return false; } + +bool decodeAudioBuffer(const std::string_view& /*audioData*/, + uint32_t /*sampleRate*/, + std::vector& /*pcm*/) { + return false; +} + +bool encodeToWav(const std::vector& /*pcm*/, + uint32_t /*sampleRate*/, + uint16_t /*bitsPerSample*/, + std::vector& /*wavData*/) { + return false; +} + +bool encodeAudioBuffer(const float* /*samples*/, + size_t /*numSamples*/, + uint32_t /*sampleRate*/, + uint16_t /*bitsPerSample*/, + const std::string& /*format*/, + std::vector& /*output*/) { + return false; +} + +} // namespace imp +} // namespace ovms + +#endif // INTEL_MPI_AVAILABLE diff --git a/src/mpi/imp_audio_utils.hpp b/src/mpi/imp_audio_utils.hpp new file mode 100644 index 0000000000..332aa0c8c5 --- /dev/null +++ b/src/mpi/imp_audio_utils.hpp @@ -0,0 +1,98 @@ +//***************************************************************************** +// Copyright 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#pragma once + +/** + * @file imp_audio_utils.hpp + * @brief Intel MPI audio utilities — thin wrappers around the Intel MPI + * decode/encode API for use inside OVMS audio calculators. + * + * When INTEL_MPI_AVAILABLE is defined (Windows builds with GStreamer), + * these helpers provide GStreamer-backed audio decode that supports a + * wider range of formats (ogg, flac, aac, opus, …) beyond the basic + * WAV/MP3 support offered by dr_libs. + * + * On Linux (or when GStreamer is not present), the functions return false + * so the caller can fall back to the existing dr_libs path. + */ + +#include +#include +#include +#include + +namespace ovms { +namespace imp { + +/** + * Check whether the Intel MPI library is linked into this build. + * Returns true only in builds compiled with INTEL_MPI_AVAILABLE whose + * imp_get_version() reports a non-zero version; GStreamer is not probed. + */ +bool isAvailable(); + +/** + * Try to decode an audio buffer (any format GStreamer supports) into + * mono float PCM samples at the requested sample rate. + * + * @param audioData Raw encoded audio bytes (WAV, MP3, OGG, FLAC, …) + * @param sampleRate Desired output sample rate (e.g. 16000) + * @param[out] pcm Output float samples, normalised to [-1, 1] + * @return true on success, false if MPI is unavailable, the input is empty, or decode fails. + */ +bool decodeAudioBuffer(const std::string_view& audioData, + uint32_t sampleRate, + std::vector& pcm); + +/** + * Encode float PCM samples to a mono 32-bit IEEE-float WAV buffer in memory. 
+ * + * @param pcm Input float samples (mono, normalised [-1,1]) + * @param sampleRate Sample rate (e.g. 16000) + * @param bitsPerSample Bits per sample for output (16 or 32) + * @param[out] wavData Output WAV bytes + * @return true on success. + */ +bool encodeToWav(const std::vector& pcm, + uint32_t sampleRate, + uint16_t bitsPerSample, + std::vector& wavData); + +/** + * Encode float PCM samples to the specified audio format. + * + * Supported formats: "wav", "mp3", "flac", "opus", "aac", "pcm". + * On Windows, non-WAV formats are encoded via the Intel MPI / GStreamer + * pipeline. On Linux (stub), every format fails and false is returned. + * + * @param samples Raw float PCM (mono) + * @param numSamples Number of float values + * @param sampleRate Sample rate (e.g. 16000) + * @param bitsPerSample Forwarded to encodeToWav for "wav", which currently ignores it (always 32-bit float) + * @param format Target format string ("wav","mp3","flac",...) + * @param[out] output Encoded bytes + * @return true on success, false on null/empty input or encode failure. + */ +bool encodeAudioBuffer(const float* samples, + size_t numSamples, + uint32_t sampleRate, + uint16_t bitsPerSample, + const std::string& format, + std::vector& output); + +} // namespace imp +} // namespace ovms diff --git a/src/mpi/imp_mpi_impl.h b/src/mpi/imp_mpi_impl.h new file mode 100644 index 0000000000..ad90e7f74c --- /dev/null +++ b/src/mpi/imp_mpi_impl.h @@ -0,0 +1,286 @@ +/** + * Intel MPI - Internal Implementation Structures + * + * NOT part of the public API. These are the C++ structs behind the + * opaque handles declared in intel_mpi.h. + * + * Only intel_mpi.cpp should include this file. 
+ */ + +#ifndef IMP_MPI_IMPL_H +#define IMP_MPI_IMPL_H + +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN + +#include +#include +#include +#include +#include +#include + +// Windows + D3D11 +#include +#include +#include + +// OpenVINO C++ API +#include +#include + +// GStreamer +#include +#include +#include +#include +#include +#include + +// Public API types +#include "intel_mpi.h" + +////////////////////////////////////////////////////////////////////////////// +// NV12 frame buffer (used internally for CPU-side NV12 data) +////////////////////////////////////////////////////////////////////////////// + +struct imp_nv12_frame_t { + std::vector y_plane; + std::vector uv_plane; + int width = 0; + int height = 0; + bool valid = false; + bool allocated = false; + + void allocate(int w, int h) { + if (allocated && width == w && height == h) return; + y_plane.resize(w * h); + uv_plane.resize(w * (h / 2)); + width = w; + height = h; + allocated = true; + valid = false; + } +}; + +////////////////////////////////////////////////////////////////////////////// +// Timing data (accumulated per-context for benchmarking) +////////////////////////////////////////////////////////////////////////////// + +struct imp_timing_data_t { + double context_create_ms = 0; + double video_open_ms = 0; + double encoder_open_ms = 0; + double total_decode_fullres_ms = 0; + double total_decode_small_ms = 0; + double total_decode_pull_fullres_ms = 0; + double total_decode_pull_small_ms = 0; + double total_decode_copy_fullres_ms = 0; + double total_decode_copy_small_ms = 0; + double total_inference_ms = 0; + double total_tensor_setup_ms = 0; + double total_infer_call_ms = 0; + double total_output_parse_ms = 0; + double total_async_start_ms = 0; + double total_async_wait_ms = 0; + double total_encode_ms = 0; + double total_display_ms = 0; + double total_overlay_ms = 0; + int async_overlaps = 0; + int frame_count = 0; +}; + 
+////////////////////////////////////////////////////////////////////////////// +// Context — device handle plus model-dimension hints (no inference state) +// +// The public API creates this from an already-compiled OV model. +// The caller is responsible for: +// 1. Loading the model XML +// 2. Applying PrePostProcessor (NV12, resize, etc.) +// 3. Compiling the model for a device +// 4. Passing the compiled model here +////////////////////////////////////////////////////////////////////////////// + +struct imp_context_s { + // Device info (imp_context_create currently hard-codes GPU / "GPU") + imp_device_type_t device_type = IMP_DEVICE_GPU; + std::string device_name; // e.g. "GPU.0" + + // Model dimension hints (for branch auto-deduction) + // Set when context is created from a compiled model. + // Used by imp_video_open when a branch's width/height are not both > 0. + int model_w = 0; + int model_h = 0; + + bool initialized = false; + std::string last_error; + imp_timing_data_t timing; +}; + +////////////////////////////////////////////////////////////////////////////// +// Tensor handle — wraps frame data for the C API +// +// Can hold either: +// - NV12 frame data (y_plane + uv_plane, format = NV12) +// - An ov::Tensor (for future use) +// +// NOTE: defined here (before imp_branch_info_t) because branches hold +// a tensor_cache instance. 
+////////////////////////////////////////////////////////////////////////////// + +struct imp_tensor_s { + // NV12 frame data (when format is NV12) + uint8_t* y_data = nullptr; // Y plane pointer (not owned) + uint8_t* uv_data = nullptr; // UV plane pointer (not owned) + int width = 0; + int height = 0; + imp_pixel_format_t format = IMP_FORMAT_NV12; + bool valid = false; + + // Optional ov::Tensor (for non-NV12 or GPU tensors) + ov::Tensor ov_tensor; + imp_device_type_t device_type = IMP_DEVICE_CPU; + std::string device_name; +}; + +////////////////////////////////////////////////////////////////////////////// +// Branch info — one per output branch in the tee pipeline +////////////////////////////////////////////////////////////////////////////// + +struct imp_branch_info_t { + GstElement* appsink = nullptr; + int width = 0; + int height = 0; + std::string name; + imp_nv12_frame_t frame; // pre-allocated frame buffer + imp_tensor_s tensor_cache; // reusable tensor wrapper + + // Per-branch timing + double total_decode_ms = 0; + double total_decode_pull_ms = 0; + double total_decode_copy_ms = 0; +}; + +////////////////////////////////////////////////////////////////////////////// +// Video stream — GStreamer tee pipeline with N output branches +////////////////////////////////////////////////////////////////////////////// + +struct imp_video_stream_s { + GstElement* pipeline = nullptr; + bool use_hw_decode = false; + + // Dynamic branches (replaces fixed fullres + small appsinks) + std::vector branches; + + // Timing reference + imp_timing_data_t* timing = nullptr; +}; + +////////////////////////////////////////////////////////////////////////////// +// Video encoder — GStreamer appsrc → encoder → mux → file +////////////////////////////////////////////////////////////////////////////// + +struct imp_video_encoder_s { + GstElement* pipeline = nullptr; + GstElement* appsrc = nullptr; + int width = 0; + int height = 0; + int fps_num = 30; + int fps_den = 1; + bool 
initialized = false; + bool is_gpu_encoder = false; + std::string encoder_name; + int64_t frame_count = 0; + + // Timing reference + imp_timing_data_t* timing = nullptr; +}; + +////////////////////////////////////////////////////////////////////////////// +// Video source configuration +////////////////////////////////////////////////////////////////////////////// + +struct imp_video_source_s { + imp_source_type_t type = IMP_SOURCE_FILE; + std::string path; // file path or URL + std::string device; // camera device id + int width = 0; + int height = 0; + int framerate = 0; + std::string format; // capture format + // Network properties + std::string transport; + std::string username; + std::string password; + int timeout_ms = 0; + bool low_latency = false; +}; + +////////////////////////////////////////////////////////////////////////////// +// Detection result (demo-level, not part of API) +////////////////////////////////////////////////////////////////////////////// + +struct imp_detection_t { + float confidence = 0.0f; + int x_min = 0, y_min = 0, x_max = 0, y_max = 0; +}; + +////////////////////////////////////////////////////////////////////////////// +// Audio encoder — GStreamer pipeline for encoding audio samples to file +////////////////////////////////////////////////////////////////////////////// + +struct imp_audio_encoder_s { + GstElement* pipeline = nullptr; + GstElement* appsrc = nullptr; + + // Output configuration + std::string output_path; + std::string codec; // "mp3", "aac", "flac", "wav", "opus" + uint32_t bitrate_kbps = 192; + uint32_t sample_rate = 44100; + uint32_t channels = 2; + + // Callback for completion notification + imp_encode_callback_t callback = nullptr; + void* user_data = nullptr; + + // State + bool initialized = false; + int64_t samples_written = 0; + + std::string last_error; +}; + +////////////////////////////////////////////////////////////////////////////// +// Audio stream — internal state for decode→encode pipeline 
+////////////////////////////////////////////////////////////////////////////// + +struct imp_audio_stream_s { + // Input + std::string input_path; + + // Output + std::string output_path; + std::string output_codec; + uint32_t output_sample_rate = 44100; + uint32_t output_channels = 2; + uint32_t output_bitrate_kbps = 192; + + // Discovered info + uint32_t sample_rate = 0; + uint32_t channels = 0; + double duration_sec = 0.0; + int64_t num_samples = 0; + + // Pipeline + GstElement* pipeline = nullptr; + + // State + bool initialized = false; + bool processed = false; + double wall_time_sec = 0.0; + + std::string last_error; +}; + +#endif // IMP_MPI_IMPL_H diff --git a/src/mpi/intel_mpi.cpp b/src/mpi/intel_mpi.cpp new file mode 100644 index 0000000000..5e4122331f --- /dev/null +++ b/src/mpi/intel_mpi.cpp @@ -0,0 +1,1531 @@ +/** + * Intel MPI - API Implementation (v8 — Layer 1 Media I/O) + * + * Implements the C-compatible functions declared in intel_mpi.h. + * Uses the internal C++ structs from imp_mpi_impl.h. + * + * This is a PURE MEDIA layer: + * - Context = device handle (extracts GPU device from model, or standalone) + * - Video decode = N-branch tee pipeline (caller defines branches) + * - Tensor = NV12 frame wrapper returned by imp_video_read_frame + * - Encoder = accepts imp_tensor_t* directly + * + * NO inference logic lives here. Inference helpers belong in the demo/OVMS layer. 
+ */ + +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#ifndef GST_USE_UNSTABLE_API +#define GST_USE_UNSTABLE_API +#endif + +#include "imp_mpi_impl.h" // same directory + +#include +#include +#include + +namespace chrono = std::chrono; + +////////////////////////////////////////////////////////////////////////////// +// Version +////////////////////////////////////////////////////////////////////////////// + +void imp_get_version(int* major, int* minor, int* patch) { + if (major) *major = IMP_VERSION_MAJOR; + if (minor) *minor = IMP_VERSION_MINOR; + if (patch) *patch = IMP_VERSION_PATCH; +} + +////////////////////////////////////////////////////////////////////////////// +// Context — device handle only, no inference +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_context_create(imp_context_t** ctx, + ov_compiled_model_t* compiled_model) { + if (!ctx) return IMP_ERROR_INVALID_ARGUMENT; + + // In this POC, compiled_model is actually a C++ ov::CompiledModel* + // cast through the C opaque pointer. + auto* cm = reinterpret_cast(compiled_model); + if (!cm) return IMP_ERROR_INVALID_ARGUMENT; + + auto start = chrono::high_resolution_clock::now(); + + auto* c = new imp_context_s(); + + // Extract model input dimensions for branch auto-deduction. + // After NV12 PPP the Y input is [1, H, W, 1]. 
+ auto inputs = cm->inputs(); + if (!inputs.empty()) { + auto shape = inputs[0].get_shape(); + if (shape.size() == 4) { + c->model_h = static_cast(shape[1]); + c->model_w = static_cast(shape[2]); + } + } + + c->device_type = IMP_DEVICE_GPU; + c->device_name = "GPU"; + c->initialized = true; + + auto end = chrono::high_resolution_clock::now(); + c->timing.context_create_ms = chrono::duration(end - start).count(); + + *ctx = c; + return IMP_OK; +} + +imp_status_t imp_context_create_from_remote(imp_context_t** ctx, + ov_remote_context_t* remote_ctx, + imp_context_type_t type) { + (void)ctx; (void)remote_ctx; (void)type; + return IMP_ERROR_INTERNAL; // Not yet implemented +} + +imp_status_t imp_context_get_native(imp_context_t* ctx, + imp_context_type_t* type, + void** native_handle) { + (void)ctx; (void)type; (void)native_handle; + return IMP_ERROR_INTERNAL; // Not yet implemented +} + +imp_status_t imp_context_get_ov_remote(imp_context_t* ctx, + ov_remote_context_t** remote_ctx) { + (void)ctx; (void)remote_ctx; + return IMP_ERROR_INTERNAL; // Not yet implemented +} + +imp_status_t imp_context_get_device_type(imp_context_t* ctx, + imp_device_type_t* device_type) { + if (!ctx || !device_type) return IMP_ERROR_INVALID_ARGUMENT; + *device_type = ctx->device_type; + return IMP_OK; +} + +imp_status_t imp_context_get_device_name(imp_context_t* ctx, + const char** device_name) { + if (!ctx || !device_name) return IMP_ERROR_INVALID_ARGUMENT; + *device_name = ctx->device_name.c_str(); + return IMP_OK; +} + +void imp_context_destroy(imp_context_t* ctx) { + delete ctx; +} + +const char* imp_context_get_error(imp_context_t* ctx) { + if (!ctx) return "null context"; + return ctx->last_error.c_str(); +} + +////////////////////////////////////////////////////////////////////////////// +// Video Source Configuration +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_video_source_create(imp_video_source_t** source, + imp_source_type_t 
type) { + if (!source) return IMP_ERROR_INVALID_ARGUMENT; + auto* s = new imp_video_source_s(); + s->type = type; + *source = s; + return IMP_OK; +} + +imp_status_t imp_video_source_set(imp_video_source_t* source, + const char* key, + const char* value) { + if (!source || !key || !value) return IMP_ERROR_INVALID_ARGUMENT; + + std::string k(key), v(value); + if (k == "path" || k == "url") source->path = v; + else if (k == "device") source->device = v; + else if (k == "width") source->width = std::stoi(v); + else if (k == "height") source->height = std::stoi(v); + else if (k == "framerate") source->framerate = std::stoi(v); + else if (k == "format") source->format = v; + else if (k == "transport") source->transport = v; + else if (k == "username") source->username = v; + else if (k == "password") source->password = v; + else if (k == "timeout") source->timeout_ms = std::stoi(v); + else if (k == "low_latency") source->low_latency = (v == "1" || v == "true"); + else return IMP_ERROR_INVALID_ARGUMENT; + + return IMP_OK; +} + +void imp_video_source_destroy(imp_video_source_t* source) { + delete source; +} + +////////////////////////////////////////////////////////////////////////////// +// Internal: read NV12 from appsink into frame buffer +////////////////////////////////////////////////////////////////////////////// + +static bool read_nv12_from_appsink(GstElement* appsink, imp_nv12_frame_t* frame, + double& total_ms, double& pull_ms, double& copy_ms) { + auto decode_start = chrono::high_resolution_clock::now(); + + auto ps = chrono::high_resolution_clock::now(); + GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), GST_SECOND); + if (!sample) return false; + pull_ms += chrono::duration( + chrono::high_resolution_clock::now() - ps).count(); + + GstBuffer* buffer = gst_sample_get_buffer(sample); + GstCaps* caps = gst_sample_get_caps(sample); + + GstVideoInfo info; + gst_video_info_from_caps(&info, caps); + + frame->allocate(info.width, info.height); 
+ frame->valid = false; + + auto cs = chrono::high_resolution_clock::now(); + + GstVideoFrame vframe; + if (gst_video_frame_map(&vframe, &info, buffer, GST_MAP_READ)) { + uint8_t* y_data = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0); + int y_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0); + + if (y_stride == frame->width) { + memcpy(frame->y_plane.data(), y_data, frame->width * frame->height); + } else { + for (int row = 0; row < frame->height; row++) + memcpy(frame->y_plane.data() + row * frame->width, + y_data + row * y_stride, frame->width); + } + + uint8_t* uv_data = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 1); + int uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 1); + + if (uv_stride == frame->width) { + memcpy(frame->uv_plane.data(), uv_data, frame->width * (frame->height / 2)); + } else { + for (int row = 0; row < frame->height / 2; row++) + memcpy(frame->uv_plane.data() + row * frame->width, + uv_data + row * uv_stride, frame->width); + } + + frame->valid = true; + gst_video_frame_unmap(&vframe); + } + + gst_sample_unref(sample); + + copy_ms += chrono::duration( + chrono::high_resolution_clock::now() - cs).count(); + total_ms += chrono::duration( + chrono::high_resolution_clock::now() - decode_start).count(); + + return frame->valid; +} + +////////////////////////////////////////////////////////////////////////////// +// Video Stream — N-branch tee pipeline +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_video_open(imp_video_stream_t** stream, + imp_video_source_t* source, + imp_context_t* ctx, + const imp_video_decode_opts_t* opts) { + if (!stream || !source || !ctx) return IMP_ERROR_INVALID_ARGUMENT; + + auto start = chrono::high_resolution_clock::now(); + + gst_init(nullptr, nullptr); + + auto* s = new imp_video_stream_s(); + s->timing = &ctx->timing; + + // Determine source resolution from source config + int src_w = (source->width > 0) ? 
source->width : 1280; + int src_h = (source->height > 0) ? source->height : 720; + + bool is_file = (source->type == IMP_SOURCE_FILE); + bool is_camera = (source->type == IMP_SOURCE_CAMERA); + + // Build resolved branch list. + // If no branches specified, create a single branch at source resolution. + struct resolved_branch { + int width; + int height; + std::string name; + }; + std::vector resolved; + + if (opts && opts->branches && opts->branch_count > 0) { + for (uint32_t i = 0; i < opts->branch_count; i++) { + resolved_branch rb; + int bw = (int)opts->branches[i].width; + int bh = (int)opts->branches[i].height; + + // Layered resolution: explicit > model dims > source res + if (bw > 0 && bh > 0) { + rb.width = bw; + rb.height = bh; + } else if (ctx->model_w > 0 && ctx->model_h > 0) { + rb.width = ctx->model_w; + rb.height = ctx->model_h; + } else { + rb.width = src_w; + rb.height = src_h; + } + + rb.name = opts->branches[i].name ? opts->branches[i].name + : ("branch_" + std::to_string(i)); + resolved.push_back(rb); + } + } else { + // Default: single branch at source resolution + resolved.push_back({src_w, src_h, "default"}); + } + + // Determine if we need a tee (multiple branches or single branch != source res) + bool needs_tee = (resolved.size() > 1); + + // Build GStreamer pipeline string + std::string pipeline_str; + + // Queue params differ for file vs camera + std::string queue_file = "queue max-size-buffers=0 max-size-time=0 max-size-bytes=0"; + std::string queue_cam = "queue max-size-buffers=2 leaky=downstream"; + std::string appsink_file = "emit-signals=false sync=false max-buffers=0 drop=false"; + std::string appsink_cam = "emit-signals=false sync=false max-buffers=2 drop=true"; + std::string queue_str = is_file ? queue_file : queue_cam; + std::string sink_props = is_file ? appsink_file : appsink_cam; + + // Source + decode + NV12 convert + if (is_file) { + pipeline_str = + "filesrc location=\"" + source->path + "\" ! " + "decodebin ! 
" + "d3d11upload ! " + "d3d11convert ! " + "video/x-raw(memory:D3D11Memory),format=NV12," + "width=" + std::to_string(src_w) + ",height=" + std::to_string(src_h); + } else { + pipeline_str = + "mfvideosrc ! " + "video/x-raw,width=" + std::to_string(src_w) + ",height=" + std::to_string(src_h) + " ! " + "d3d11upload ! " + "d3d11convert ! " + "video/x-raw(memory:D3D11Memory),format=NV12," + "width=" + std::to_string(src_w) + ",height=" + std::to_string(src_h); + } + + if (needs_tee) { + pipeline_str += " ! tee name=t"; + + for (size_t i = 0; i < resolved.size(); i++) { + auto& rb = resolved[i]; + std::string sink_name = "branch_" + std::to_string(i); + + pipeline_str += " t. ! " + queue_str + " ! "; + + // If branch resolution differs from source, add d3d11scale + if (rb.width != src_w || rb.height != src_h) { + pipeline_str += + "d3d11scale ! " + "video/x-raw(memory:D3D11Memory),format=NV12," + "width=" + std::to_string(rb.width) + ",height=" + std::to_string(rb.height) + " ! "; + } + + pipeline_str += + "d3d11download ! " + "video/x-raw,format=NV12," + "width=" + std::to_string(rb.width) + ",height=" + std::to_string(rb.height) + " ! " + "appsink name=" + sink_name + " " + sink_props; + } + } else { + // Single branch — no tee needed + auto& rb = resolved[0]; + + if (rb.width != src_w || rb.height != src_h) { + pipeline_str += + " ! d3d11scale ! " + "video/x-raw(memory:D3D11Memory),format=NV12," + "width=" + std::to_string(rb.width) + ",height=" + std::to_string(rb.height); + } + + pipeline_str += + " ! d3d11download ! " + "video/x-raw,format=NV12," + "width=" + std::to_string(rb.width) + ",height=" + std::to_string(rb.height) + " ! " + "appsink name=branch_0 " + sink_props; + } + + std::cout << "Pipeline: " << pipeline_str << std::endl; + + GError* error = nullptr; + s->pipeline = gst_parse_launch(pipeline_str.c_str(), &error); + if (error || !s->pipeline) { + ctx->last_error = error ? 
error->message : "pipeline creation failed"; + if (error) g_error_free(error); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + + // Retrieve appsinks and populate branch info + s->branches.resize(resolved.size()); + for (size_t i = 0; i < resolved.size(); i++) { + std::string sink_name = "branch_" + std::to_string(i); + s->branches[i].appsink = gst_bin_get_by_name(GST_BIN(s->pipeline), sink_name.c_str()); + if (!s->branches[i].appsink) { + ctx->last_error = "failed to get appsink: " + sink_name; + gst_object_unref(s->pipeline); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + s->branches[i].width = resolved[i].width; + s->branches[i].height = resolved[i].height; + s->branches[i].name = resolved[i].name; + s->branches[i].frame.allocate(resolved[i].width, resolved[i].height); + } + + gst_element_set_state(s->pipeline, GST_STATE_PLAYING); + + GstStateChangeReturn ret = gst_element_get_state( + s->pipeline, nullptr, nullptr, 5 * GST_SECOND); + if (ret == GST_STATE_CHANGE_FAILURE) { + ctx->last_error = "pipeline failed to start"; + for (auto& b : s->branches) { + if (b.appsink) gst_object_unref(b.appsink); + } + gst_object_unref(s->pipeline); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + + // Get actual dimensions from first branch negotiated caps + GstPad* pad = gst_element_get_static_pad(s->branches[0].appsink, "sink"); + if (pad) { + GstCaps* caps = gst_pad_get_current_caps(pad); + if (caps) { + GstStructure* st = gst_caps_get_structure(caps, 0); + int actual_w = 0, actual_h = 0; + gst_structure_get_int(st, "width", &actual_w); + gst_structure_get_int(st, "height", &actual_h); + if (actual_w > 0) s->branches[0].width = actual_w; + if (actual_h > 0) s->branches[0].height = actual_h; + gst_caps_unref(caps); + } + gst_object_unref(pad); + } + + s->use_hw_decode = true; + + auto end = chrono::high_resolution_clock::now(); + ctx->timing.video_open_ms = chrono::duration(end - start).count(); + + // Print branch info + for (size_t i = 0; i < s->branches.size(); 
i++) { + auto& b = s->branches[i]; + std::cout << "Branch " << i << " [" << b.name << "]: " + << b.width << "x" << b.height << " NV12" << std::endl; + } + if (s->branches.size() > 1) { + std::cout << "GPU resize via d3d11scale (on D3D11)" << std::endl; + } + + *stream = s; + return IMP_OK; +} + +////////////////////////////////////////////////////////////////////////////// +// Video Read Frame — branch-aware +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_video_read_frame(imp_tensor_t** tensor, + imp_video_stream_t* stream, + uint32_t branch_index) { + if (!stream || branch_index >= (uint32_t)stream->branches.size()) + return IMP_ERROR_INVALID_ARGUMENT; + + auto& branch = stream->branches[branch_index]; + + bool ok = read_nv12_from_appsink( + branch.appsink, &branch.frame, + branch.total_decode_ms, + branch.total_decode_pull_ms, + branch.total_decode_copy_ms); + + if (!ok) return IMP_ERROR_STREAM_END; + + // Fill tensor wrapper pointing to the branch's frame data + branch.tensor_cache.y_data = branch.frame.y_plane.data(); + branch.tensor_cache.uv_data = branch.frame.uv_plane.data(); + branch.tensor_cache.width = branch.frame.width; + branch.tensor_cache.height = branch.frame.height; + branch.tensor_cache.format = IMP_FORMAT_NV12; + branch.tensor_cache.valid = true; + branch.tensor_cache.device_type = IMP_DEVICE_CPU; + + if (tensor) *tensor = &branch.tensor_cache; + return IMP_OK; +} + +imp_status_t imp_video_read_frame_by_name(imp_tensor_t** tensor, + imp_video_stream_t* stream, + const char* branch_name) { + if (!stream || !branch_name) return IMP_ERROR_INVALID_ARGUMENT; + + for (uint32_t i = 0; i < (uint32_t)stream->branches.size(); i++) { + if (stream->branches[i].name == branch_name) { + return imp_video_read_frame(tensor, stream, i); + } + } + return IMP_ERROR_INVALID_ARGUMENT; // branch name not found +} + +imp_status_t imp_video_start_async(imp_video_stream_t* stream, + imp_video_frame_callback_t callback, 
+ void* user_data) { + (void)stream; (void)callback; (void)user_data; + return IMP_ERROR_INTERNAL; // Not yet implemented +} + +void imp_video_stop(imp_video_stream_t* stream) { + (void)stream; +} + +imp_status_t imp_video_get_info(imp_video_stream_t* stream, + uint32_t* width, uint32_t* height, + float* fps, int64_t* frame_count) { + if (!stream || stream->branches.empty()) return IMP_ERROR_INVALID_ARGUMENT; + // Return first branch dimensions as the "primary" stream info + if (width) *width = (uint32_t)stream->branches[0].width; + if (height) *height = (uint32_t)stream->branches[0].height; + if (fps) *fps = 30.0f; // TODO: detect from stream + if (frame_count) *frame_count = -1; // unknown for live + return IMP_OK; +} + +void imp_video_close(imp_video_stream_t* stream) { + if (!stream) return; + if (stream->pipeline) { + gst_element_set_state(stream->pipeline, GST_STATE_NULL); + for (auto& b : stream->branches) { + if (b.appsink) gst_object_unref(b.appsink); + } + gst_object_unref(stream->pipeline); + } + delete stream; +} + +////////////////////////////////////////////////////////////////////////////// +// Video Encoder +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_video_encoder_create(imp_video_encoder_t** encoder, + uint32_t width, uint32_t height, + imp_context_t* ctx, + const imp_video_encode_opts_t* opts) { + if (!encoder || !ctx) return IMP_ERROR_INVALID_ARGUMENT; + + auto* e = new imp_video_encoder_s(); + e->width = (int)width; + e->height = (int)height; + e->fps_num = (opts && opts->framerate > 0) ? (int)opts->framerate : 30; + e->fps_den = 1; + e->timing = &ctx->timing; + + std::string output_path = (opts && opts->output_path) ? opts->output_path : "output.mp4"; + + std::cout << "=== Opening GPU encoder: " << output_path << " ===" << std::endl; + + std::string pipeline = + "appsrc name=src format=time is-live=true do-timestamp=true ! 
" + "video/x-raw,format=NV12,width=" + std::to_string(width) + + ",height=" + std::to_string(height) + + ",framerate=" + std::to_string(e->fps_num) + "/" + std::to_string(e->fps_den) + " ! " + "mfh264enc ! h264parse ! mp4mux ! filesink location=\"" + output_path + "\""; + + GError* error = nullptr; + e->pipeline = gst_parse_launch(pipeline.c_str(), &error); + if (error) { + ctx->last_error = error->message; + g_error_free(error); + delete e; + return IMP_ERROR_ENCODE_FAILED; + } + + e->appsrc = gst_bin_get_by_name(GST_BIN(e->pipeline), "src"); + if (!e->appsrc) { + gst_object_unref(e->pipeline); + delete e; + return IMP_ERROR_ENCODE_FAILED; + } + + g_object_set(e->appsrc, "stream-type", 0, "format", GST_FORMAT_TIME, nullptr); + + GstStateChangeReturn ret = gst_element_set_state(e->pipeline, GST_STATE_PLAYING); + if (ret == GST_STATE_CHANGE_FAILURE) { + gst_object_unref(e->appsrc); + gst_object_unref(e->pipeline); + delete e; + return IMP_ERROR_ENCODE_FAILED; + } + gst_element_get_state(e->pipeline, nullptr, nullptr, 500 * GST_MSECOND); + + e->encoder_name = "mfh264enc"; + e->is_gpu_encoder = true; + e->initialized = true; + + std::cout << "GPU encoder opened: mfh264enc" << std::endl; + + *encoder = e; + return IMP_OK; +} + +/** + * Encode a frame from an imp_tensor_t (NV12 format). + * The tensor must have valid y_data and uv_data pointers. 
+ */ +imp_status_t imp_video_encoder_write(imp_video_encoder_t* encoder, + imp_tensor_t* tensor) { + if (!encoder || !encoder->initialized || !tensor || !tensor->valid) + return IMP_ERROR_INVALID_ARGUMENT; + + auto encode_start = chrono::high_resolution_clock::now(); + + int w = tensor->width; + int h = tensor->height; + size_t y_size = (size_t)w * h; + size_t uv_size = (size_t)w * (h / 2); + + GstBuffer* buffer = gst_buffer_new_allocate(nullptr, y_size + uv_size, nullptr); + if (!buffer) return IMP_ERROR_ENCODE_FAILED; + + GstMapInfo map; + if (!gst_buffer_map(buffer, &map, GST_MAP_WRITE)) { + gst_buffer_unref(buffer); + return IMP_ERROR_ENCODE_FAILED; + } + + memcpy(map.data, tensor->y_data, y_size); + memcpy(map.data + y_size, tensor->uv_data, uv_size); + gst_buffer_unmap(buffer, &map); + + GstClockTime duration = gst_util_uint64_scale(GST_SECOND, + encoder->fps_den, encoder->fps_num); + GST_BUFFER_PTS(buffer) = encoder->frame_count * duration; + GST_BUFFER_DURATION(buffer) = duration; + + GstFlowReturn ret = gst_app_src_push_buffer(GST_APP_SRC(encoder->appsrc), buffer); + if (ret != GST_FLOW_OK) return IMP_ERROR_ENCODE_FAILED; + + encoder->frame_count++; + + if (encoder->timing) { + encoder->timing->total_encode_ms += chrono::duration( + chrono::high_resolution_clock::now() - encode_start).count(); + } + + return IMP_OK; +} + +void imp_video_encoder_close(imp_video_encoder_t* encoder) { + if (!encoder || !encoder->initialized) return; + + gst_app_src_end_of_stream(GST_APP_SRC(encoder->appsrc)); + + GstBus* bus = gst_element_get_bus(encoder->pipeline); + if (bus) { + GstMessage* msg = gst_bus_timed_pop_filtered(bus, 5 * GST_SECOND, + (GstMessageType)(GST_MESSAGE_EOS | GST_MESSAGE_ERROR)); + if (msg) gst_message_unref(msg); + gst_object_unref(bus); + } + + gst_element_set_state(encoder->pipeline, GST_STATE_NULL); + gst_object_unref(encoder->appsrc); + gst_object_unref(encoder->pipeline); + + std::cout << "Encoder closed. 
Frames: " << encoder->frame_count << std::endl; + delete encoder; +} + +////////////////////////////////////////////////////////////////////////////// +// Tensor utilities +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_tensor_get_device_type(imp_tensor_t* tensor, + imp_device_type_t* device_type) { + if (!tensor || !device_type) return IMP_ERROR_INVALID_ARGUMENT; + *device_type = tensor->device_type; + return IMP_OK; +} + +imp_status_t imp_tensor_get_device_name(imp_tensor_t* tensor, + const char** device_name) { + if (!tensor || !device_name) return IMP_ERROR_INVALID_ARGUMENT; + *device_name = tensor->device_name.c_str(); + return IMP_OK; +} + +imp_status_t imp_tensor_get_context_type(imp_tensor_t* tensor, + imp_context_type_t* context_type) { + (void)tensor; (void)context_type; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_tensor_get_ov(imp_tensor_t* tensor, + void** ov_tensor, + imp_device_type_t* device_type) { + if (!tensor || !ov_tensor) return IMP_ERROR_INVALID_ARGUMENT; + *ov_tensor = &tensor->ov_tensor; + if (device_type) *device_type = tensor->device_type; + return IMP_OK; +} + +imp_status_t imp_tensor_get_shape(imp_tensor_t* tensor, + int64_t* dims, + size_t* num_dims) { + if (!tensor || !dims || !num_dims) return IMP_ERROR_INVALID_ARGUMENT; + + // For NV12 tensors, report [height, width] as the shape + if (tensor->format == IMP_FORMAT_NV12) { + size_t n = std::min(*num_dims, (size_t)2); + if (n >= 1) dims[0] = tensor->height; + if (n >= 2) dims[1] = tensor->width; + *num_dims = 2; + return IMP_OK; + } + + // For ov::Tensor backed tensors + auto shape = tensor->ov_tensor.get_shape(); + size_t n = std::min(*num_dims, shape.size()); + for (size_t i = 0; i < n; i++) dims[i] = (int64_t)shape[i]; + *num_dims = shape.size(); + return IMP_OK; +} + +imp_status_t imp_tensor_get_element_type(imp_tensor_t* tensor, + imp_element_type_t* type) { + if (!tensor || !type) return IMP_ERROR_INVALID_ARGUMENT; + // 
NV12 tensors are always U8 + if (tensor->format == IMP_FORMAT_NV12) { + *type = IMP_TYPE_U8; + return IMP_OK; + } + return IMP_ERROR_INTERNAL; // TODO: map ov::element::Type +} + +void imp_tensor_release(imp_tensor_t* tensor) { + if (!tensor) return; + delete tensor; +} + +void imp_free(void* ptr) { + free(ptr); +} + +////////////////////////////////////////////////////////////////////////////// +// HW support queries +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_hw_decode_supported(imp_context_t* ctx, bool* supported) { + (void)ctx; + if (supported) *supported = true; + return IMP_OK; +} + +imp_status_t imp_hw_encode_supported(imp_context_t* ctx, bool* supported) { + (void)ctx; + if (supported) *supported = true; + return IMP_OK; +} + +////////////////////////////////////////////////////////////////////////////// +// Image decode / encode — stubs +////////////////////////////////////////////////////////////////////////////// + +imp_status_t imp_decode_image(imp_tensor_t** t, const void* d, size_t s, + imp_context_t* c, const imp_image_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t;(void)d;(void)s;(void)c;(void)o;(void)cb;(void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_decode_image_file(imp_tensor_t** t, const char* f, + imp_context_t* c, const imp_image_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t;(void)f;(void)c;(void)o;(void)cb;(void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_decode_audio(imp_tensor_t** tensor, const void* data, size_t size, + imp_context_t* ctx, const imp_audio_decode_opts_t* opts, + imp_decode_callback_t callback, void* user_data) { + (void)callback; (void)user_data; // async not yet implemented + if (!tensor || !data || size == 0) return IMP_ERROR_INVALID_ARGUMENT; + + gst_init(nullptr, nullptr); + + // Resolve options + uint32_t target_rate = (opts && opts->sample_rate > 0) ? 
opts->sample_rate : 16000; + uint32_t target_channels = (opts && opts->channels > 0) ? opts->channels : 1; + bool normalize = opts ? opts->normalize : true; + (void)normalize; // GStreamer F32LE is already [-1,1] + + // Build pipeline: appsrc → decodebin → audioconvert → audioresample → caps → appsink + std::string capsStr = + "audio/x-raw,format=F32LE,channels=" + std::to_string(target_channels) + + ",rate=" + std::to_string(target_rate); + + std::string pipelineStr = + "appsrc name=src ! decodebin ! audioconvert ! audioresample ! " + + capsStr + " ! appsink name=sink sync=false"; + + GError* error = nullptr; + GstElement* pipeline = gst_parse_launch(pipelineStr.c_str(), &error); + if (error || !pipeline) { + if (ctx) ctx->last_error = error ? error->message : "audio pipeline creation failed"; + if (error) g_error_free(error); + return IMP_ERROR_DECODE_FAILED; + } + + GstElement* appsrc = gst_bin_get_by_name(GST_BIN(pipeline), "src"); + GstElement* appsink = gst_bin_get_by_name(GST_BIN(pipeline), "sink"); + if (!appsrc || !appsink) { + if (appsrc) gst_object_unref(appsrc); + if (appsink) gst_object_unref(appsink); + gst_object_unref(pipeline); + return IMP_ERROR_INTERNAL; + } + + // Allocate GStreamer buffer and copy input data + GstBuffer* buf = gst_buffer_new_allocate(nullptr, size, nullptr); + GstMapInfo map; + if (gst_buffer_map(buf, &map, GST_MAP_WRITE)) { + memcpy(map.data, data, size); + gst_buffer_unmap(buf, &map); + } + + gst_element_set_state(pipeline, GST_STATE_PLAYING); + + // Push buffer + EOS + gst_app_src_push_buffer(GST_APP_SRC(appsrc), buf); // takes ownership of buf + gst_app_src_end_of_stream(GST_APP_SRC(appsrc)); + + // Pull all decoded float samples + std::vector samples; + while (true) { + GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 5 * GST_SECOND); + if (!sample) break; + + GstBuffer* outBuf = gst_sample_get_buffer(sample); + GstMapInfo outMap; + if (gst_buffer_map(outBuf, &outMap, GST_MAP_READ)) { + size_t 
numFloats = outMap.size / sizeof(float); + const float* fdata = reinterpret_cast(outMap.data); + samples.insert(samples.end(), fdata, fdata + numFloats); + gst_buffer_unmap(outBuf, &outMap); + } + gst_sample_unref(sample); + } + + // Check for pipeline errors + imp_status_t status = IMP_OK; + GstBus* bus = gst_element_get_bus(pipeline); + GstMessage* msg = gst_bus_pop_filtered(bus, + static_cast(GST_MESSAGE_ERROR)); + if (msg) { + GError* err = nullptr; + gst_message_parse_error(msg, &err, nullptr); + if (ctx && err) ctx->last_error = err->message; + if (err) g_error_free(err); + gst_message_unref(msg); + status = IMP_ERROR_DECODE_FAILED; + } + gst_object_unref(bus); + + gst_element_set_state(pipeline, GST_STATE_NULL); + gst_object_unref(appsrc); + gst_object_unref(appsink); + gst_object_unref(pipeline); + + if (status != IMP_OK || samples.empty()) { + return status != IMP_OK ? status : IMP_ERROR_DECODE_FAILED; + } + + // Wrap samples in an imp_tensor_t backed by ov::Tensor + auto* t = new imp_tensor_s(); + t->ov_tensor = ov::Tensor(ov::element::f32, {1, samples.size()}); + std::memcpy(t->ov_tensor.data(), samples.data(), samples.size() * sizeof(float)); + t->device_type = IMP_DEVICE_CPU; + t->device_name = "CPU"; + t->format = IMP_FORMAT_GRAY; // 1-D audio — not a pixel format, but marks non-NV12 + t->valid = true; + + *tensor = t; + return IMP_OK; +} + +imp_status_t imp_decode_audio_file(imp_tensor_t** t, const char* f, + imp_context_t* c, const imp_audio_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t;(void)f;(void)c;(void)o;(void)cb;(void)ud; + return IMP_ERROR_INTERNAL; // TODO: implement - decode file to tensor with all samples +} + +imp_status_t imp_audio_file_info(const char* file_path, + uint32_t* sample_rate, + uint32_t* channels, + double* duration_sec) { + if (!file_path) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + // Convert to URI + std::string uri = "file:///"; + for (const char* p = file_path; *p; ++p) { + if (*p == '\\') uri 
+= '/'; + else if (*p == ' ') uri += "%20"; + else uri += *p; + } + + GError* error = nullptr; + GstDiscoverer* discoverer = gst_discoverer_new(10 * GST_SECOND, &error); + if (!discoverer) { + if (error) g_error_free(error); + return IMP_ERROR_DECODE_FAILED; + } + + GstDiscovererInfo* info = gst_discoverer_discover_uri(discoverer, uri.c_str(), &error); + if (!info) { + if (error) g_error_free(error); + g_object_unref(discoverer); + return IMP_ERROR_DECODE_FAILED; + } + + // Get duration + if (duration_sec) { + GstClockTime dur = gst_discoverer_info_get_duration(info); + *duration_sec = (double)dur / GST_SECOND; + } + + // Get audio stream info + GList* audio_streams = gst_discoverer_info_get_audio_streams(info); + if (audio_streams) { + GstDiscovererAudioInfo* audio_info = (GstDiscovererAudioInfo*)audio_streams->data; + if (sample_rate) { + *sample_rate = gst_discoverer_audio_info_get_sample_rate(audio_info); + } + if (channels) { + *channels = gst_discoverer_audio_info_get_channels(audio_info); + } + gst_discoverer_stream_info_list_free(audio_streams); + } else { + // No audio stream found + if (sample_rate) *sample_rate = 0; + if (channels) *channels = 0; + } + + gst_discoverer_info_unref(info); + g_object_unref(discoverer); + + return IMP_OK; +} + +////////////////////////////////////////////////////////////////////////////// +// Audio Stream Implementation (high-level decode → encode) +////////////////////////////////////////////////////////////////////////////// + +// Helper: Get audio duration using GstDiscoverer +static double imp_audio_get_duration(const std::string& filepath) { + GError* error = nullptr; + + std::string uri = "file:///"; + for (char c : filepath) { + if (c == '\\') uri += '/'; + else if (c == ' ') uri += "%20"; + else uri += c; + } + + GstDiscoverer* discoverer = gst_discoverer_new(10 * GST_SECOND, &error); + if (!discoverer) { + if (error) g_error_free(error); + return 0.0; + } + + GstDiscovererInfo* info = 
gst_discoverer_discover_uri(discoverer, uri.c_str(), &error); + if (!info) { + if (error) g_error_free(error); + g_object_unref(discoverer); + return 0.0; + } + + GstClockTime duration = gst_discoverer_info_get_duration(info); + double duration_sec = (double)duration / GST_SECOND; + + gst_discoverer_info_unref(info); + g_object_unref(discoverer); + + return duration_sec; +} + +// Forward declarations for helpers defined in the Audio Encoder section below +static std::string imp_audio_get_encoder(const std::string& codec); +static std::string imp_audio_get_muxer(const std::string& codec); + +imp_status_t imp_audio_open(imp_audio_stream_t** stream, + const char* input_path, + const char* output_path, + const imp_audio_stream_opts_t* opts) { + if (!stream || !input_path) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + auto* s = new imp_audio_stream_s(); + s->input_path = input_path; + if (output_path) { + s->output_path = output_path; + } + + if (opts) { + s->output_sample_rate = opts->sample_rate > 0 ? opts->sample_rate : 44100; + s->output_channels = opts->channels > 0 ? opts->channels : 2; + s->output_codec = opts->output_codec ? opts->output_codec : "mp3"; + s->output_bitrate_kbps = opts->output_bitrate_kbps > 0 ? 
opts->output_bitrate_kbps : 192; + } + + // Get duration from input file + s->duration_sec = imp_audio_get_duration(input_path); + if (s->duration_sec <= 0) { + s->last_error = "Failed to discover audio duration"; + delete s; + return IMP_ERROR_DECODE_FAILED; + } + + s->sample_rate = s->output_sample_rate; + s->channels = s->output_channels; + s->num_samples = (int64_t)(s->duration_sec * s->output_sample_rate); + + s->initialized = true; + *stream = s; + return IMP_OK; +} + +imp_status_t imp_audio_get_info(imp_audio_stream_t* stream, imp_audio_info_t* info) { + if (!stream || !info) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + info->sample_rate = stream->sample_rate; + info->channels = stream->channels; + info->duration_sec = stream->duration_sec; + info->num_samples = stream->num_samples; + + return IMP_OK; +} + +imp_status_t imp_audio_process(imp_audio_stream_t* stream) { + if (!stream || !stream->initialized) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + if (stream->output_path.empty()) { + stream->last_error = "No output path specified"; + return IMP_ERROR_INVALID_ARGUMENT; + } + + // Convert paths to forward slashes for GStreamer + std::string input_path = stream->input_path; + std::string output_path = stream->output_path; + std::replace(input_path.begin(), input_path.end(), '\\', '/'); + std::replace(output_path.begin(), output_path.end(), '\\', '/'); + + std::string encoder = imp_audio_get_encoder(stream->output_codec); + std::string muxer = imp_audio_get_muxer(stream->output_codec); + + // Build pipeline: filesrc → decodebin → audioconvert → audioresample → caps → encoder → [muxer] → filesink + std::string pipeline_str = + "filesrc location=\"" + input_path + "\" ! " + "decodebin ! " + "audioconvert ! " + "audioresample ! " + "audio/x-raw,format=S16LE,channels=" + std::to_string(stream->output_channels) + + ",rate=" + std::to_string(stream->output_sample_rate) + " ! 
"; + + if (encoder == "lamemp3enc") { + pipeline_str += "lamemp3enc bitrate=" + std::to_string(stream->output_bitrate_kbps) + " ! "; + } else if (encoder == "avenc_aac") { + pipeline_str += "avenc_aac bitrate=" + std::to_string(stream->output_bitrate_kbps * 1000) + " ! "; + } else if (encoder == "opusenc") { + pipeline_str += "opusenc bitrate=" + std::to_string(stream->output_bitrate_kbps * 1000) + " ! "; + } else { + pipeline_str += encoder + " ! "; + } + + if (!muxer.empty()) { + pipeline_str += muxer + " ! "; + } + + pipeline_str += "filesink location=\"" + output_path + "\""; + + GError* error = nullptr; + stream->pipeline = gst_parse_launch(pipeline_str.c_str(), &error); + + if (error || !stream->pipeline) { + stream->last_error = error ? error->message : "Pipeline creation failed"; + if (error) g_error_free(error); + return IMP_ERROR_DECODE_FAILED; + } + + auto start_time = chrono::high_resolution_clock::now(); + + GstStateChangeReturn ret = gst_element_set_state(stream->pipeline, GST_STATE_PLAYING); + if (ret == GST_STATE_CHANGE_FAILURE) { + stream->last_error = "Failed to start pipeline"; + gst_object_unref(stream->pipeline); + stream->pipeline = nullptr; + return IMP_ERROR_DECODE_FAILED; + } + + // Wait for EOS or error + GstBus* bus = gst_element_get_bus(stream->pipeline); + GstMessage* msg = gst_bus_timed_pop_filtered(bus, GST_CLOCK_TIME_NONE, + (GstMessageType)(GST_MESSAGE_EOS | GST_MESSAGE_ERROR)); + + auto end_time = chrono::high_resolution_clock::now(); + stream->wall_time_sec = chrono::duration(end_time - start_time).count(); + + imp_status_t status = IMP_OK; + if (msg) { + if (GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR) { + GError* err = nullptr; + gchar* debug = nullptr; + gst_message_parse_error(msg, &err, &debug); + stream->last_error = err ? 
err->message : "Unknown error"; + if (err) g_error_free(err); + if (debug) g_free(debug); + status = IMP_ERROR_ENCODE_FAILED; + } + gst_message_unref(msg); + } + + gst_object_unref(bus); + gst_element_set_state(stream->pipeline, GST_STATE_NULL); + gst_object_unref(stream->pipeline); + stream->pipeline = nullptr; + + stream->processed = (status == IMP_OK); + return status; +} + +imp_status_t imp_audio_get_timing(imp_audio_stream_t* stream, + double* wall_time_sec, + double* realtime_factor) { + if (!stream) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + if (wall_time_sec) { + *wall_time_sec = stream->wall_time_sec; + } + if (realtime_factor) { + if (stream->wall_time_sec > 0) { + *realtime_factor = stream->duration_sec / stream->wall_time_sec; + } else { + *realtime_factor = 0.0; + } + } + + return IMP_OK; +} + +void imp_audio_close(imp_audio_stream_t* stream) { + if (!stream) return; + + if (stream->pipeline) { + gst_element_set_state(stream->pipeline, GST_STATE_NULL); + gst_object_unref(stream->pipeline); + } + + delete stream; +} + +imp_status_t imp_encode_image(void** d, size_t* s, imp_tensor_t* t, + imp_context_t* c, const imp_image_encode_opts_t* o, + imp_encode_callback_t cb, void* ud) { + (void)d;(void)s;(void)t;(void)c;(void)o;(void)cb;(void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_encode_image_file(const char* f, imp_tensor_t* t, + imp_context_t* c, const imp_image_encode_opts_t* o, + imp_encode_callback_t cb, void* ud) { + (void)f;(void)t;(void)c;(void)o;(void)cb;(void)ud; + return IMP_ERROR_INTERNAL; +} + +////////////////////////////////////////////////////////////////////////////// +// Audio Encoder Implementation +////////////////////////////////////////////////////////////////////////////// + +// Helper: Get encoder element for codec +static std::string imp_audio_get_encoder(const std::string& codec) { + if (codec == "mp3") return "lamemp3enc"; + if (codec == "aac") return "avenc_aac"; + if (codec == "flac") return "flacenc"; + if 
(codec == "opus") return "opusenc"; + if (codec == "wav") return "wavenc"; + return "lamemp3enc"; // default +} + +// Helper: Get muxer element for codec (some don't need muxer) +static std::string imp_audio_get_muxer(const std::string& codec) { + if (codec == "aac") return "mp4mux"; + if (codec == "flac") return ""; // no muxer needed + if (codec == "wav") return ""; // wavenc is already containerized + if (codec == "opus") return "oggmux"; + return ""; // mp3 with lamemp3enc doesn't need muxer +} + +imp_status_t imp_audio_encoder_create(imp_audio_encoder_t** encoder, + const imp_audio_encode_opts_t* opts, + imp_encode_callback_t callback, + void* user_data) { + if (!encoder || !opts || !opts->output_path) { + return IMP_ERROR_INVALID_ARGUMENT; + } + + auto* e = new imp_audio_encoder_s(); + e->output_path = opts->output_path; + e->codec = opts->codec ? opts->codec : "mp3"; + e->bitrate_kbps = opts->bitrate_kbps > 0 ? opts->bitrate_kbps : 192; + e->sample_rate = opts->sample_rate > 0 ? opts->sample_rate : 44100; + e->channels = opts->channels > 0 ? opts->channels : 2; + e->callback = callback; + e->user_data = user_data; + + // Convert path to forward slashes for GStreamer + std::string output_path = e->output_path; + std::replace(output_path.begin(), output_path.end(), '\\', '/'); + + // Get encoder element + std::string enc_element = imp_audio_get_encoder(e->codec); + std::string muxer = imp_audio_get_muxer(e->codec); + + // Build pipeline: appsrc → audioconvert → audioresample → encoder → [muxer] → filesink + std::string pipeline_str = + "appsrc name=src format=time ! " + "audioconvert ! " + "audioresample ! " + "audio/x-raw,format=S16LE,channels=" + std::to_string(e->channels) + + ",rate=" + std::to_string(e->sample_rate) + " ! "; + + // Add encoder with appropriate options + if (enc_element == "lamemp3enc") { + pipeline_str += "lamemp3enc bitrate=" + std::to_string(e->bitrate_kbps) + " ! 
";
+    } else if (enc_element == "avenc_aac") {
+        pipeline_str += "avenc_aac bitrate=" + std::to_string(e->bitrate_kbps * 1000) + " ! ";
+    } else if (enc_element == "opusenc") {
+        pipeline_str += "opusenc bitrate=" + std::to_string(e->bitrate_kbps * 1000) + " ! ";
+    } else {
+        pipeline_str += enc_element + " ! ";
+    }
+
+    // Add muxer if needed
+    if (!muxer.empty()) {
+        pipeline_str += muxer + " ! ";
+    }
+
+    pipeline_str += "filesink location=\"" + output_path + "\"";
+
+    // Create pipeline
+    GError* error = nullptr;
+    e->pipeline = gst_parse_launch(pipeline_str.c_str(), &error);
+
+    if (error || !e->pipeline) {
+        e->last_error = error ? error->message : "Pipeline creation failed";
+        if (error) g_error_free(error);
+        delete e;
+        return IMP_ERROR_ENCODE_FAILED;
+    }
+
+    // Get appsrc element
+    e->appsrc = gst_bin_get_by_name(GST_BIN(e->pipeline), "src");
+    if (!e->appsrc) {
+        e->last_error = "Failed to get appsrc element";
+        gst_object_unref(e->pipeline);
+        delete e;
+        return IMP_ERROR_INTERNAL;
+    }
+
+    // Configure appsrc caps
+    GstCaps* caps = gst_caps_new_simple("audio/x-raw",
+        "format", G_TYPE_STRING, "S16LE",
+        "rate", G_TYPE_INT, (int)e->sample_rate,
+        "channels", G_TYPE_INT, (int)e->channels,
+        "layout", G_TYPE_STRING, "interleaved",
+        nullptr);
+    g_object_set(e->appsrc, "caps", caps, "is-live", FALSE, nullptr);
+    gst_caps_unref(caps);
+
+    // Start pipeline
+    GstStateChangeReturn ret = gst_element_set_state(e->pipeline, GST_STATE_PLAYING);
+    if (ret == GST_STATE_CHANGE_FAILURE) {
+        e->last_error = "Failed to start pipeline";
+        gst_object_unref(e->appsrc);
+        gst_object_unref(e->pipeline);
+        delete e;
+        return IMP_ERROR_ENCODE_FAILED;
+    }
+
+    e->initialized = true;
+    *encoder = e;
+    return IMP_OK;
+}
+
+// Push audio samples into a streaming encoder.
+//
+// Not implemented yet: the arguments are validated and then
+// IMP_ERROR_INTERNAL is returned unconditionally, so nothing is ever fed
+// to appsrc through this entry point.
+//
+// Returns IMP_ERROR_INVALID_ARGUMENT when encoder or tensor is NULL or the
+// encoder was not fully initialized; IMP_ERROR_INTERNAL otherwise.
+imp_status_t imp_audio_encoder_write(imp_audio_encoder_t* encoder,
+                                      imp_tensor_t* tensor) {
+    if (!encoder || !encoder->initialized || !tensor) {
+        return IMP_ERROR_INVALID_ARGUMENT;
+    }
+
+    // TODO: Extract audio data from tensor and push to appsrc
+    // For now, this is a stub - real implementation needs tensor audio data access
+    (void)tensor;
+
+    return IMP_ERROR_INTERNAL;  // Not fully implemented yet
+}
+
+// Finalize the stream and release the encoder. Passing NULL is a no-op.
+//
+// Sends EOS into appsrc, waits up to 5 seconds for the pipeline to post EOS
+// or an error on its bus, reports the outcome to the completion callback
+// (when one was registered), then stops the pipeline and frees the handle.
+//
+// Status handed to the callback (data is always nullptr / size 0 because the
+// output went to a file):
+//   IMP_OK                   EOS was received
+//   IMP_ERROR_ENCODE_FAILED  the pipeline posted an error
+//   IMP_ERROR_TIMEOUT        neither arrived within 5 seconds
+//   IMP_ERROR_INTERNAL       the pipeline bus could not be obtained
+void imp_audio_encoder_close(imp_audio_encoder_t* encoder) {
+    if (!encoder) return;
+
+    if (encoder->appsrc) {
+        // Signal end of stream
+        gst_app_src_end_of_stream(GST_APP_SRC(encoder->appsrc));
+    }
+
+    if (encoder->pipeline) {
+        // The callback must fire exactly once on every path, including the
+        // ones where no bus message is ever seen; otherwise a caller waiting
+        // for completion would block forever.
+        imp_status_t result = IMP_ERROR_INTERNAL;
+
+        // Wait for EOS to propagate
+        GstBus* bus = gst_element_get_bus(encoder->pipeline);
+        if (bus) {
+            result = IMP_ERROR_TIMEOUT;
+            GstMessage* msg = gst_bus_timed_pop_filtered(bus, 5 * GST_SECOND,
+                (GstMessageType)(GST_MESSAGE_EOS | GST_MESSAGE_ERROR));
+            if (msg) {
+                result = (GST_MESSAGE_TYPE(msg) == GST_MESSAGE_ERROR)
+                             ? IMP_ERROR_ENCODE_FAILED
+                             : IMP_OK;
+                gst_message_unref(msg);
+            }
+            gst_object_unref(bus);
+        }
+
+        if (encoder->callback) {
+            encoder->callback(result, nullptr, 0, encoder->user_data);
+        }
+
+        gst_element_set_state(encoder->pipeline, GST_STATE_NULL);
+
+        if (encoder->appsrc) {
+            gst_object_unref(encoder->appsrc);
+        }
+        gst_object_unref(encoder->pipeline);
+    }
+
+    delete encoder;
+}
+
+// ---------------------------------------------------------------------------
+// One-shot audio encode (to memory)
+// ---------------------------------------------------------------------------
+
+// Internal: write a WAV file into a malloc'd buffer (no GStreamer)
+//
+// Produces a 44-byte RIFF/WAVE header (format tag 3 = IEEE float, 32 bits per
+// sample) followed by the samples copied verbatim.
+//
+//   data, data_size  outputs; written only on success
+//   samples          num_samples float values, interleaved across channels
+//   sampleRate       value stored in the header
+//   channels         value stored in the header
+//
+// Returns IMP_ERROR_INVALID_ARGUMENT when a value does not fit the fixed-width
+// header fields (the RIFF sizes are 32-bit, channel count and block alignment
+// are 16-bit) instead of silently truncating it, IMP_ERROR_OUT_OF_MEMORY when
+// the buffer cannot be allocated, IMP_OK otherwise.
+static imp_status_t imp_encode_audio_wav(void** data, size_t* data_size,
+                                          const float* samples, size_t num_samples,
+                                          uint32_t sampleRate, uint32_t channels) {
+    const uint32_t kHeaderSize = 44;
+    const uint32_t kBytesPerSample = static_cast<uint32_t>(sizeof(float));
+
+    // Payload plus header must be expressible in 32 bits.
+    if (num_samples > (UINT32_MAX - kHeaderSize) / kBytesPerSample)
+        return IMP_ERROR_INVALID_ARGUMENT;
+    // blockAlign (channels * 4) is a 16-bit header field.
+    if (channels == 0 || channels > UINT16_MAX / kBytesPerSample)
+        return IMP_ERROR_INVALID_ARGUMENT;
+
+    const uint32_t blockAlign = channels * kBytesPerSample;
+    // byteRate (sampleRate * blockAlign) is a 32-bit header field.
+    if (sampleRate > UINT32_MAX / blockAlign)
+        return IMP_ERROR_INVALID_ARGUMENT;
+
+    const uint32_t byteRate  = sampleRate * blockAlign;
+    const uint32_t dataSize  = static_cast<uint32_t>(num_samples) * kBytesPerSample;
+    const uint32_t chunkSize = 36 + dataSize;
+    const size_t   totalSize = static_cast<size_t>(kHeaderSize) + dataSize;
+
+    uint8_t* buf = static_cast<uint8_t*>(malloc(totalSize));
+    if (!buf) return IMP_ERROR_OUT_OF_MEMORY;
+
+    // Little-endian field writers; WAV is little-endian regardless of host.
+    auto w16 = [](uint8_t* p, uint16_t v) { p[0]=v&0xFF; p[1]=(v>>8)&0xFF; };
+    auto w32 = [](uint8_t* p, uint32_t v) { p[0]=v&0xFF; p[1]=(v>>8)&0xFF; p[2]=(v>>16)&0xFF; p[3]=(v>>24)&0xFF; };
+
+    memcpy(buf, "RIFF", 4);     w32(buf+4, chunkSize);
+    memcpy(buf+8, "WAVE", 4);
+    memcpy(buf+12, "fmt ", 4);  w32(buf+16, 16);
+    w16(buf+20, 3);  // IEEE float
+    w16(buf+22, static_cast<uint16_t>(channels));
+    w32(buf+24, sampleRate);
+    w32(buf+28, byteRate);
+    w16(buf+32, static_cast<uint16_t>(blockAlign));
+    w16(buf+34, 32); // bits per sample
+    memcpy(buf+36, "data", 4);  w32(buf+40, dataSize);
+    memcpy(buf+kHeaderSize, samples, dataSize);
+
+    *data = buf;
+    *data_size = totalSize;
+    return IMP_OK;
+}
+
+// One-shot encode of float PCM samples into a malloc'd memory buffer.
+//
+//   data, data_size  outputs; set to nullptr / 0 on every failure path
+//   samples          num_samples float values, interleaved across channels
+//   opts             codec (NULL = "wav"), sample_rate (0 = 16000),
+//                    channels (0 = 1), bitrate_kbps (0 = 192)
+//
+// "wav" uses the built-in header writer, "pcm" returns the raw float bytes,
+// "mp3" / "flac" / "opus" / "aac" run a GStreamer pipeline
+// (appsrc ! audioconvert ! audioresample ! encoder [! muxer] ! appsink).
+//
+// Returns
+//   IMP_ERROR_INVALID_ARGUMENT  NULL / empty input or unknown codec
+//   IMP_ERROR_OUT_OF_MEMORY     an allocation failed
+//   IMP_ERROR_ENCODE_FAILED     pipeline could not be built or started, posted
+//                               an error, or produced no output
+//   IMP_ERROR_TIMEOUT           no sample and no EOS within 10 seconds; partial
+//                               output is discarded rather than returned
+//   IMP_ERROR_INTERNAL          appsrc/appsink lookup or buffer mapping failed
+imp_status_t imp_encode_audio(void** data,
+                              size_t* data_size,
+                              const float* samples,
+                              size_t num_samples,
+                              const imp_audio_encode_opts_t* opts) {
+    if (!data || !data_size || !samples || num_samples == 0 || !opts)
+        return IMP_ERROR_INVALID_ARGUMENT;
+
+    // Never leave the outputs indeterminate when an error is returned.
+    *data = nullptr;
+    *data_size = 0;
+
+    std::string codec = opts->codec ? opts->codec : "wav";
+    // Rate of the samples being handed in; the encoder-side rate below may be
+    // overridden (Opus), the appsrc caps and buffer duration must not be.
+    const uint32_t inputRate = opts->sample_rate > 0 ? opts->sample_rate : 16000;
+    uint32_t sampleRate = inputRate;
+    uint32_t channels   = opts->channels > 0 ? opts->channels : 1;
+
+    // ----- WAV: built-in writer (fast, no GStreamer) -----
+    if (codec == "wav") {
+        return imp_encode_audio_wav(data, data_size, samples, num_samples,
+                                    sampleRate, channels);
+    }
+
+    if (num_samples > SIZE_MAX / sizeof(float))
+        return IMP_ERROR_INVALID_ARGUMENT;
+    const size_t byteSize = num_samples * sizeof(float);
+
+    // ----- PCM: raw float bytes -----
+    if (codec == "pcm") {
+        void* copy = malloc(byteSize);
+        if (!copy) return IMP_ERROR_OUT_OF_MEMORY;
+        memcpy(copy, samples, byteSize);
+        *data = copy;
+        *data_size = byteSize;
+        return IMP_OK;
+    }
+
+    // ----- Lossy / lossless codecs via GStreamer -----
+    gst_init(nullptr, nullptr);
+
+    std::string encElement;
+    std::string muxElement;
+    uint32_t bitrate = opts->bitrate_kbps > 0 ? opts->bitrate_kbps : 192;
+
+    if (codec == "mp3") {
+        encElement = "lamemp3enc bitrate=" + std::to_string(bitrate);
+    } else if (codec == "flac") {
+        encElement = "flacenc";
+    } else if (codec == "opus") {
+        // Opus standard sample rates: 8k, 12k, 16k, 24k, 48k
+        uint32_t opusRate = (sampleRate <= 8000)  ? 8000 :
+                            (sampleRate <= 12000) ? 12000 :
+                            (sampleRate <= 16000) ? 16000 :
+                            (sampleRate <= 24000) ? 24000 : 48000;
+        encElement = "opusenc bitrate=" + std::to_string(bitrate * 1000);
+        // audioresample will convert to opusRate automatically via caps
+        muxElement = "oggmux";
+        sampleRate = opusRate;  // override for caps
+    } else if (codec == "aac") {
+        encElement = "avenc_aac bitrate=" + std::to_string(bitrate * 1000);
+        // NOTE(review): could not confirm that an element named "adtsmux" ships
+        // with stock GStreamer - verify with gst-inspect-1.0; if it is missing,
+        // gst_parse_launch fails and this codec always returns ENCODE_FAILED.
+        muxElement = "aacparse ! adtsmux";
+    } else {
+        return IMP_ERROR_INVALID_ARGUMENT;
+    }
+
+    std::string pipeStr =
+        "appsrc name=src format=time ! "
+        "audioconvert ! audioresample ! "
+        "audio/x-raw,format=S16LE,channels=" + std::to_string(channels) +
+        ",rate=" + std::to_string(sampleRate) + " ! " +
+        encElement + " ! ";
+    if (!muxElement.empty()) pipeStr += muxElement + " ! ";
+    pipeStr += "appsink name=sink";
+
+    GError* error = nullptr;
+    GstElement* pipeline = gst_parse_launch(pipeStr.c_str(), &error);
+    if (error || !pipeline) {
+        if (error) g_error_free(error);
+        if (pipeline) gst_object_unref(pipeline);
+        return IMP_ERROR_ENCODE_FAILED;
+    }
+
+    GstElement* appsrc  = gst_bin_get_by_name(GST_BIN(pipeline), "src");
+    GstElement* appsink = gst_bin_get_by_name(GST_BIN(pipeline), "sink");
+    if (!appsrc || !appsink) {
+        if (appsrc) gst_object_unref(appsrc);
+        if (appsink) gst_object_unref(appsink);
+        gst_object_unref(pipeline);
+        return IMP_ERROR_INTERNAL;
+    }
+
+    // Single teardown path shared by every exit below.
+    auto teardown = [&]() {
+        gst_element_set_state(pipeline, GST_STATE_NULL);
+        gst_object_unref(appsrc);
+        gst_object_unref(appsink);
+        gst_object_unref(pipeline);
+    };
+
+    // Configure appsrc for F32LE input
+    GstCaps* caps = gst_caps_new_simple("audio/x-raw",
+        "format", G_TYPE_STRING, "F32LE",
+        "rate", G_TYPE_INT, (int)inputRate,
+        "channels", G_TYPE_INT, (int)channels,
+        "layout", G_TYPE_STRING, "interleaved",
+        nullptr);
+    g_object_set(appsrc, "caps", caps, "is-live", FALSE, nullptr);
+    gst_caps_unref(caps);
+
+    if (gst_element_set_state(pipeline, GST_STATE_PLAYING) == GST_STATE_CHANGE_FAILURE) {
+        teardown();
+        return IMP_ERROR_ENCODE_FAILED;
+    }
+
+    // Push float samples
+    GstBuffer* buffer = gst_buffer_new_allocate(nullptr, byteSize, nullptr);
+    if (!buffer) {
+        teardown();
+        return IMP_ERROR_OUT_OF_MEMORY;
+    }
+    GstMapInfo map;
+    if (!gst_buffer_map(buffer, &map, GST_MAP_WRITE)) {
+        gst_buffer_unref(buffer);
+        teardown();
+        return IMP_ERROR_INTERNAL;
+    }
+    memcpy(map.data, samples, byteSize);
+    gst_buffer_unmap(buffer, &map);
+
+    GST_BUFFER_PTS(buffer) = 0;
+    GST_BUFFER_DURATION(buffer) = gst_util_uint64_scale(
+        num_samples / channels, GST_SECOND, inputRate);
+
+    // push_buffer takes ownership of the buffer whether or not it succeeds
+    if (gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer) != GST_FLOW_OK) {
+        teardown();
+        return IMP_ERROR_ENCODE_FAILED;
+    }
+    gst_app_src_end_of_stream(GST_APP_SRC(appsrc));
+
+    // Collect encoded output from appsink
+    std::vector<uint8_t> encoded;
+    bool reachedEos = false;
+    for (;;) {
+        GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND);
+        if (!sample) {
+            // NULL means either EOS (normal completion) or a timeout.
+            reachedEos = gst_app_sink_is_eos(GST_APP_SINK(appsink));
+            break;
+        }
+        GstBuffer* outBuf = gst_sample_get_buffer(sample);
+        GstMapInfo outMap;
+        if (outBuf && gst_buffer_map(outBuf, &outMap, GST_MAP_READ)) {
+            encoded.insert(encoded.end(), outMap.data, outMap.data + outMap.size);
+            gst_buffer_unmap(outBuf, &outMap);
+        }
+        gst_sample_unref(sample);
+    }
+
+    // Without EOS the collected bytes are an incomplete stream. Tell a pipeline
+    // error apart from a plain stall; this has to happen before teardown.
+    imp_status_t failure = IMP_OK;
+    if (!reachedEos) {
+        failure = IMP_ERROR_TIMEOUT;
+        GstBus* bus = gst_element_get_bus(pipeline);
+        if (bus) {
+            GstMessage* msg = gst_bus_timed_pop_filtered(bus, 0, GST_MESSAGE_ERROR);
+            if (msg) {
+                failure = IMP_ERROR_ENCODE_FAILED;
+                gst_message_unref(msg);
+            }
+            gst_object_unref(bus);
+        }
+    }
+
+    teardown();
+
+    if (failure != IMP_OK) return failure;
+    if (encoded.empty()) return IMP_ERROR_ENCODE_FAILED;
+
+    void* out = malloc(encoded.size());
+    if (!out) return IMP_ERROR_OUT_OF_MEMORY;
+    memcpy(out, encoded.data(), encoded.size());
+    *data = out;
+    *data_size = encoded.size();
+    return IMP_OK;
+}
+
+// One-shot encode of float PCM samples straight to a file.
+//
+// Encodes to memory with imp_encode_audio(), then writes the bytes to
+// file_path (binary mode, truncating any existing file).
+//
+// Returns IMP_ERROR_INVALID_ARGUMENT when file_path is NULL, the status from
+// imp_encode_audio() when encoding fails, IMP_ERROR_INTERNAL when the file
+// cannot be opened, fully written or closed, IMP_OK otherwise.
+imp_status_t imp_encode_audio_file(const char* file_path,
+                                    const float* samples,
+                                    size_t num_samples,
+                                    const imp_audio_encode_opts_t* opts) {
+    if (!file_path) return IMP_ERROR_INVALID_ARGUMENT;
+
+    // Encode to memory, then write to file
+    void* data = nullptr;
+    size_t data_size = 0;
+    imp_status_t st = imp_encode_audio(&data, &data_size, samples, num_samples, opts);
+    if (st != IMP_OK) return st;
+
+    FILE* fp = fopen(file_path, "wb");
+    if (!fp) { free(data); return IMP_ERROR_INTERNAL; }
+
+    // A short write or a failed flush-on-close both mean the file is incomplete.
+    const size_t written = fwrite(data, 1, data_size, fp);
+    const bool closed = (fclose(fp) == 0);
+    free(data);
+
+    if (written != data_size || !closed) return IMP_ERROR_INTERNAL;
+    return IMP_OK;
+}
+
diff --git a/src/mpi/intel_mpi.h b/src/mpi/intel_mpi.h
new file mode 100644
index 0000000000..168d1562aa
--- /dev/null
+++ b/src/mpi/intel_mpi.h
@@ -0,0 +1,1013 @@
+/**
+ * Intel Media Processing Interface (Intel MPI)
+ *
+ * Lightweight API for media decode/encode with zero-copy OpenVINO integration.
+ * Designed for Intel GPU acceleration with minimal CPU-GPU memory transfers.
+ *
+ * Version: 0.1.0
+ */
+
+#ifndef INTEL_MPI_H
+#define INTEL_MPI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+//////////////////////////////////////////////////////////////////////////////
+// Forward Declarations (OpenVINO C API types)
+//////////////////////////////////////////////////////////////////////////////
+
+typedef struct ov_compiled_model ov_compiled_model_t;
+typedef struct ov_remote_context ov_remote_context_t;
+
+//////////////////////////////////////////////////////////////////////////////
+// Device Types
+//////////////////////////////////////////////////////////////////////////////
+
+typedef enum {
+    IMP_DEVICE_AUTO = 0,  // Auto-detect from context (match inference device)
+    IMP_DEVICE_CPU = 1,   // CPU (host memory)
+    IMP_DEVICE_GPU = 2,   // Intel GPU
+    IMP_DEVICE_NPU = 3,   // Intel NPU
+} imp_device_type_t;
+
+//////////////////////////////////////////////////////////////////////////////
+// Tensor Handle
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Opaque tensor handle (pointer type)
+ *
+ * Wraps either ov::Tensor (CPU) or ov::RemoteTensor (GPU/NPU).
+ * Use imp_tensor_get_*() functions to query properties.
+ * + * NOTE: Despite the abstraction, the underlying tensor type matters: + * - CPU context: wraps ov::Tensor (host memory) + * - GPU/NPU context: wraps ov::RemoteTensor (device memory) + * Use imp_tensor_get_device_type() to determine the actual type, + * and imp_tensor_get_ov() to access the underlying OpenVINO tensor. + */ +typedef struct imp_tensor_s imp_tensor_t; + +////////////////////////////////////////////////////////////////////////////// +// Version +////////////////////////////////////////////////////////////////////////////// + +#define IMP_VERSION_MAJOR 0 +#define IMP_VERSION_MINOR 1 +#define IMP_VERSION_PATCH 0 + +////////////////////////////////////////////////////////////////////////////// +// Error Codes +////////////////////////////////////////////////////////////////////////////// + +typedef enum { + IMP_OK = 0, + IMP_ERROR_INVALID_ARGUMENT = -1, + IMP_ERROR_OUT_OF_MEMORY = -2, + IMP_ERROR_DEVICE_NOT_AVAILABLE = -3, + IMP_ERROR_UNSUPPORTED_FORMAT = -4, + IMP_ERROR_DECODE_FAILED = -5, + IMP_ERROR_ENCODE_FAILED = -6, + IMP_ERROR_CONTEXT_MISMATCH = -7, + IMP_ERROR_STREAM_END = -8, + IMP_ERROR_TIMEOUT = -9, + IMP_ERROR_INTERNAL = -99 +} imp_status_t; + +////////////////////////////////////////////////////////////////////////////// +// Context Types +////////////////////////////////////////////////////////////////////////////// + +typedef enum { + IMP_CONTEXT_OPENCL = 0, // OpenCL context (Linux/Windows) + IMP_CONTEXT_D3D11 = 1, // Direct3D 11 (Windows) + IMP_CONTEXT_VAAPI = 2, // VA-API (Linux) +} imp_context_type_t; + +////////////////////////////////////////////////////////////////////////////// +// Data Types +////////////////////////////////////////////////////////////////////////////// + +typedef enum { + IMP_TYPE_U8 = 0, + IMP_TYPE_U16 = 1, + IMP_TYPE_FP16 = 2, + IMP_TYPE_FP32 = 3, + IMP_TYPE_I8 = 4, + IMP_TYPE_I32 = 5 +} imp_element_type_t; + +typedef enum { + IMP_FORMAT_RGB = 0, + IMP_FORMAT_BGR = 1, + IMP_FORMAT_NV12 = 2, // YUV 4:2:0, two 
planes (Y + interleaved UV) + IMP_FORMAT_I420 = 3, // YUV 4:2:0, three planes + IMP_FORMAT_GRAY = 4, + IMP_FORMAT_RGBA = 5, + IMP_FORMAT_BGRA = 6 +} imp_pixel_format_t; + +typedef enum { + IMP_LAYOUT_NHWC = 0, // [batch, height, width, channels] + IMP_LAYOUT_NCHW = 1, // [batch, channels, height, width] + IMP_LAYOUT_HWC = 2, // [height, width, channels] + IMP_LAYOUT_CHW = 3 // [channels, height, width] +} imp_layout_t; + +////////////////////////////////////////////////////////////////////////////// +// Context +////////////////////////////////////////////////////////////////////////////// + +/** + * Opaque context handle (pointer type) + */ +typedef struct imp_context_s imp_context_t; + +/** + * Create context from OpenVINO compiled model + * Extracts GPU context from model for zero-copy tensor sharing. + * + * @param ctx Output: context handle pointer + * @param compiled_model OpenVINO compiled model (must be GPU-compiled) + * @return Status code + */ +imp_status_t imp_context_create(imp_context_t** ctx, + ov_compiled_model_t* compiled_model); + +/** + * Create context from OpenVINO remote context + * Use when sharing context between multiple models. + * + * @param ctx Output: context handle pointer + * @param remote_ctx OpenVINO remote context + * @param type Context backend type + * @return Status code + */ +imp_status_t imp_context_create_from_remote(imp_context_t** ctx, + ov_remote_context_t* remote_ctx, + imp_context_type_t type); + +/** + * Get underlying native handle + * + * @param ctx Context handle + * @param type Output: context type + * @param native_handle Output: native handle (cl_context, ID3D11Device*, etc.) 
+ * @return Status code + */ +imp_status_t imp_context_get_native(imp_context_t* ctx, + imp_context_type_t* type, + void** native_handle); + +/** + * Get OpenVINO remote context + * + * @param ctx Context handle + * @param remote_ctx Output: OpenVINO remote context + * @return Status code + */ +imp_status_t imp_context_get_ov_remote(imp_context_t* ctx, + ov_remote_context_t** remote_ctx); + +/** + * Get device type the context is configured for + * + * @param ctx Context handle + * @param device_type Output: device type + * @return Status code + */ +imp_status_t imp_context_get_device_type(imp_context_t* ctx, + imp_device_type_t* device_type); + +/** + * Get device name (e.g., "GPU.0", "GPU.1", "CPU", "NPU") + * + * @param ctx Context handle + * @param device_name Output: device name string (valid until context destroyed) + * @return Status code + */ +imp_status_t imp_context_get_device_name(imp_context_t* ctx, + const char** device_name); + +/** + * Destroy context + * + * @param ctx Context handle + */ +void imp_context_destroy(imp_context_t* ctx); + +// TODO: imp_context_create_from_device(imp_context_t** ctx, const char* device_name) +// Creates a pure media context for decode/encode without a model. +// Needed for transcode workflows with no inference. 
+ +/** + * Get last error message for context + * + * @param ctx Context handle + * @return Error message string (valid until next API call) + */ +const char* imp_context_get_error(imp_context_t* ctx); + +////////////////////////////////////////////////////////////////////////////// +// Video Source Configuration +////////////////////////////////////////////////////////////////////////////// + +typedef enum { + IMP_SOURCE_FILE = 0, // Local file path + IMP_SOURCE_URL = 1, // Network URL (rtsp://, http://, udp://) + IMP_SOURCE_CAMERA = 2, // Camera device +} imp_source_type_t; + +/** + * Opaque video source configuration handle (pointer type) + */ +typedef struct imp_video_source_s imp_video_source_t; + +/** + * Create video source configuration + * + * @param source Output: source configuration handle pointer + * @param type Source type + * @return Status code + */ +imp_status_t imp_video_source_create(imp_video_source_t** source, + imp_source_type_t type); + +/** + * Set source property + * + * Standard keys by source type: + * IMP_SOURCE_FILE: + * - "path": file path (required) + * IMP_SOURCE_URL: + * - "url": stream URL (required) + * - "transport": "tcp" or "udp" (RTSP only) + * - "username": auth username + * - "password": auth password + * - "timeout": connection timeout in ms + * - "low_latency": "1" to enable + * IMP_SOURCE_CAMERA: + * - "device": device index ("0", "1") or path ("/dev/video0") + * - "width": preferred width + * - "height": preferred height + * - "framerate": preferred FPS + * - "format": capture format ("mjpeg", "nv12", "yuyv") + * + * @param source Source configuration handle + * @param key Property key + * @param value Property value (as string) + * @return Status code + */ +imp_status_t imp_video_source_set(imp_video_source_t* source, + const char* key, + const char* value); + +/** + * Destroy source configuration + * + * @param source Source configuration handle + */ +void imp_video_source_destroy(imp_video_source_t* source); + 
+////////////////////////////////////////////////////////////////////////////// +// Decode Configuration +////////////////////////////////////////////////////////////////////////////// + +/** + * Image decode options + */ +typedef struct { + imp_pixel_format_t output_format; // Desired output format (default: RGB) + imp_layout_t output_layout; // Tensor layout (default: NHWC) + imp_element_type_t output_type; // Element type (default: U8) + uint32_t resize_width; // 0 = keep original + uint32_t resize_height; // 0 = keep original + const char* decode_device; // Where decoding runs: "CPU", "GPU", "GPU.0", "GPU.1", "NPU", NULL = match context + const char* output_device; // Where result tensor is placed: NULL = same as decode_device +} imp_image_decode_opts_t; + +/** + * Video decode output branch + * + * Defines one output branch at a specific resolution/format. + * Multiple branches share a single decode + tee pipeline on GPU. + * Width/height of 0 means: deduce from context model dims. + * If context has no model and dims are 0, source resolution is used. + */ +typedef struct { + uint32_t width; // Output width (0 = deduce from model, or source res) + uint32_t height; // Output height (0 = deduce from model, or source res) + imp_pixel_format_t format; // Output format (default: NV12) + const char* name; // Optional branch name (for imp_video_read_frame_by_name) +} imp_video_branch_t; + +/** + * Video stream configuration + * + * If branches is NULL and branch_count is 0, a single branch at source + * resolution is created automatically. 
+ */ +typedef struct { + imp_pixel_format_t output_format; // Base format after decode (default: NV12) + imp_layout_t output_layout; // Tensor layout (default: NHWC) + imp_element_type_t output_type; // Element type (default: U8) + const char* decode_device; // Where decoding runs: "CPU", "GPU", "GPU.0", "GPU.1", "NPU", NULL = match context + const char* output_device; // Where result tensor is placed: NULL = same as decode_device + uint32_t buffer_count; // Frame buffer pool size (default: 4) + int64_t timeout_ms; // Read timeout, -1 = infinite + const imp_video_branch_t* branches; // Array of output branches (NULL = single branch at source res) // TODO FIXME do we need more than one? + uint32_t branch_count; // Number of branches (0 = single branch at source res) +} imp_video_decode_opts_t; + +/** + * Audio decode options + */ +typedef struct { + uint32_t sample_rate; // Target sample rate (0 = keep original) + uint32_t channels; // Target channels (0 = keep original) + imp_element_type_t output_type; // Element type (default: FP32) + bool normalize; // Normalize to [-1,1] for float types (default: true) +} imp_audio_decode_opts_t; + +////////////////////////////////////////////////////////////////////////////// +// Encode Configuration +////////////////////////////////////////////////////////////////////////////// + +/** + * Image encode options + */ +typedef struct { + const char* format; // "jpeg", "png", "bmp" + int quality; // JPEG quality 1-100 (default: 90) + const char* input_device; // Expected input tensor location: NULL = accept any, copy if needed + const char* encode_device; // Where encoding runs: "CPU", "GPU", "GPU.0", etc., NULL = match context +} imp_image_encode_opts_t; + +/** + * Video encode options + */ +typedef struct { + const char* codec; // "h264", "h265", "av1" + uint32_t bitrate_kbps; // Target bitrate + uint32_t framerate; // Frames per second + const char* input_device; // Expected input tensor location: NULL = accept any, copy if 
needed + const char* encode_device; // Where encoding runs: "CPU", "GPU", "GPU.0", etc., NULL = match context + const char* output_path; // Output file path (NULL for memory) +} imp_video_encode_opts_t; + +////////////////////////////////////////////////////////////////////////////// +// Async Callback Types +////////////////////////////////////////////////////////////////////////////// + +/** + * Decode completion callback + * + * @param status Operation status + * @param tensor Output tensor (owned by caller after callback) + * @param user_data User-provided data + */ +typedef void (*imp_decode_callback_t)(imp_status_t status, + imp_tensor_t* tensor, + void* user_data); + +/** + * Video frame callback (for streaming) + * + * @param status Operation status (IMP_ERROR_STREAM_END when done) + * @param tensor Frame tensor (valid only during callback) + * @param frame_index Frame number (0-based) + * @param timestamp_us Presentation timestamp in microseconds + * @param user_data User-provided data + * @return true to continue, false to stop stream + */ +typedef bool (*imp_video_frame_callback_t)(imp_status_t status, + imp_tensor_t* tensor, + uint64_t frame_index, + int64_t timestamp_us, + void* user_data); + +/** + * Encode completion callback + * + * @param status Operation status + * @param data Encoded data (owned by caller after callback) + * @param size Data size in bytes + * @param user_data User-provided data + */ +typedef void (*imp_encode_callback_t)(imp_status_t status, + void* data, + size_t size, + void* user_data); + +////////////////////////////////////////////////////////////////////////////// +// Image Decode +////////////////////////////////////////////////////////////////////////////// + +/** + * Decode image from memory buffer + * + * @param tensor Output: tensor handle pointer (CPU or GPU depending on context/options) + * @param data Encoded image data (JPEG, PNG, etc.) 
+ * @param size Data size in bytes + * @param ctx Context handle + * @param opts Decode options (NULL for defaults) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code (IMP_OK if async started successfully) + */ +imp_status_t imp_decode_image(imp_tensor_t** tensor, + const void* data, + size_t size, + imp_context_t* ctx, + const imp_image_decode_opts_t* opts, + imp_decode_callback_t callback, + void* user_data); + +/** + * Decode image from file + * + * @param tensor Output: tensor handle pointer (CPU or GPU depending on context/options) + * @param file_path Path to image file + * @param ctx Context handle + * @param opts Decode options (NULL for defaults) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code + */ +imp_status_t imp_decode_image_file(imp_tensor_t** tensor, + const char* file_path, + imp_context_t* ctx, + const imp_image_decode_opts_t* opts, + imp_decode_callback_t callback, + void* user_data); + +////////////////////////////////////////////////////////////////////////////// +// Video Decode +////////////////////////////////////////////////////////////////////////////// + +/** + * Opaque video stream handle (pointer type) + */ +typedef struct imp_video_stream_s imp_video_stream_t; + +/** + * Open video stream from source configuration + * + * @param stream Output: stream handle pointer + * @param source Source configuration (ownership transferred, do not destroy separately) + * @param ctx Context handle + * @param opts Stream options (NULL for defaults) + * @return Status code + */ +imp_status_t imp_video_open(imp_video_stream_t** stream, + imp_video_source_t* source, + imp_context_t* ctx, + const imp_video_decode_opts_t* opts); + +/** + * Read next frame from a specific branch + * + * @param tensor Output: frame tensor pointer (NV12 data, CPU side) + * @param stream Stream handle + * @param branch_index 
Branch index (0-based, must be < branch_count) + * @return Status code (IMP_ERROR_STREAM_END when no more frames) + */ +imp_status_t imp_video_read_frame(imp_tensor_t** tensor, + imp_video_stream_t* stream, + uint32_t branch_index); + +/** + * Read next frame from a named branch + * + * @param tensor Output: frame tensor pointer (NV12 data, CPU side) + * @param stream Stream handle + * @param branch_name Branch name (as specified in imp_video_branch_t) + * @return Status code (IMP_ERROR_STREAM_END when no more frames) + */ +imp_status_t imp_video_read_frame_by_name(imp_tensor_t** tensor, + imp_video_stream_t* stream, + const char* branch_name); + +/** + * Start async frame processing with callback + * + * @param stream Stream handle + * @param callback Frame callback + * @param user_data User data for callback + * @return Status code + */ +imp_status_t imp_video_start_async(imp_video_stream_t* stream, + imp_video_frame_callback_t callback, + void* user_data); + +/** + * Stop async processing + * + * @param stream Stream handle + */ +void imp_video_stop(imp_video_stream_t* stream); + +/** + * Get video metadata + * + * @param stream Stream handle + * @param width Output: frame width + * @param height Output: frame height + * @param fps Output: frames per second + * @param frame_count Output: total frames (-1 if unknown/live) + * @return Status code + */ +imp_status_t imp_video_get_info(imp_video_stream_t* stream, + uint32_t* width, + uint32_t* height, + float* fps, + int64_t* frame_count); + +/** + * Close video stream + * + * @param stream Stream handle + */ +void imp_video_close(imp_video_stream_t* stream); + +/** + * Query video file metadata + * + * Use before decode to determine file properties. 
+ * + * @param file_path Path to video file + * @param width Output: frame width (can be NULL) + * @param height Output: frame height (can be NULL) + * @param fps Output: frames per second (can be NULL) + * @param frame_count Output: total frames, -1 if unknown (can be NULL) + * @param duration_sec Output: duration in seconds (can be NULL) + * @return Status code + */ +imp_status_t imp_video_file_info(const char* file_path, + uint32_t* width, + uint32_t* height, + float* fps, + int64_t* frame_count, + double* duration_sec); + +// TODO: One-shot video decode (batch decode entire video at once) +// Decode entire video to 4D tensor [frames, H, W, C] +// imp_status_t imp_decode_video(imp_tensor_t** tensor, const void* data, size_t size, ...); +// imp_status_t imp_decode_video_file(imp_tensor_t** tensor, const char* file_path, ...); + +////////////////////////////////////////////////////////////////////////////// +// Audio Decode +////////////////////////////////////////////////////////////////////////////// + +/** + * Decode audio from memory buffer + * + * @param tensor Output: tensor handle pointer with audio samples + * @param data Encoded audio data + * @param size Data size in bytes + * @param ctx Context handle + * @param opts Decode options (NULL for defaults) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code + */ +imp_status_t imp_decode_audio(imp_tensor_t** tensor, + const void* data, + size_t size, + imp_context_t* ctx, + const imp_audio_decode_opts_t* opts, + imp_decode_callback_t callback, + void* user_data); + +/** + * Decode audio from file + * + * @param tensor Output: tensor handle pointer with audio samples + * @param file_path Path to audio file + * @param ctx Context handle + * @param opts Decode options (NULL for defaults) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code + */ +imp_status_t 
imp_decode_audio_file(imp_tensor_t** tensor, + const char* file_path, + imp_context_t* ctx, + const imp_audio_decode_opts_t* opts, + imp_decode_callback_t callback, + void* user_data); + +/** + * Query audio file metadata + * + * Use before decode to determine file properties and allocate appropriately. + * + * @param file_path Path to audio file + * @param sample_rate Output: sample rate in Hz (can be NULL) + * @param channels Output: number of channels (can be NULL) + * @param duration_sec Output: duration in seconds (can be NULL) + * @return Status code + */ +imp_status_t imp_audio_file_info(const char* file_path, + uint32_t* sample_rate, + uint32_t* channels, + double* duration_sec); + +////////////////////////////////////////////////////////////////////////////// +// Audio Stream (high-level decode → encode pipeline) +////////////////////////////////////////////////////////////////////////////// + +/** + * Audio stream options + */ +typedef struct { + uint32_t sample_rate; // Output sample rate (0 = 44100) + uint32_t channels; // Output channels (0 = 2) + const char* output_codec; // "mp3","aac","flac","wav","opus" (NULL = "mp3") + uint32_t output_bitrate_kbps; // Bitrate for lossy codecs (0 = 192) + bool expose_samples; // Reserved for future use +} imp_audio_stream_opts_t; + +/** + * Audio stream info (returned by imp_audio_get_info) + */ +typedef struct { + uint32_t sample_rate; + uint32_t channels; + double duration_sec; + int64_t num_samples; +} imp_audio_info_t; + +/** + * Opaque audio stream handle (pointer type) + */ +typedef struct imp_audio_stream_s imp_audio_stream_t; + +/** + * Open audio stream for processing + * + * Creates a decode → encode pipeline for file-based audio processing. 
+ * + * @param stream Output: stream handle pointer + * @param input_path Input audio file path + * @param output_path Output audio file path (NULL for decode-only) + * @param opts Stream options (NULL for defaults: 44100 Hz, stereo, MP3 192kbps) + * @return Status code + */ +imp_status_t imp_audio_open(imp_audio_stream_t** stream, + const char* input_path, + const char* output_path, + const imp_audio_stream_opts_t* opts); + +/** + * Get audio stream info (duration, sample rate, channels) + * + * @param stream Stream handle + * @param info Output: audio info structure + * @return Status code + */ +imp_status_t imp_audio_get_info(imp_audio_stream_t* stream, + imp_audio_info_t* info); + +/** + * Process audio stream (run decode → encode pipeline to completion) + * + * @param stream Stream handle + * @return Status code + */ +imp_status_t imp_audio_process(imp_audio_stream_t* stream); + +/** + * Get processing timing information + * + * @param stream Stream handle + * @param wall_time_sec Output: wall-clock time in seconds + * @param realtime_factor Output: audio_duration / wall_time (>1 = faster than realtime) + * @return Status code + */ +imp_status_t imp_audio_get_timing(imp_audio_stream_t* stream, + double* wall_time_sec, + double* realtime_factor); + +/** + * Close audio stream and free resources + * + * @param stream Stream handle + */ +void imp_audio_close(imp_audio_stream_t* stream); + +////////////////////////////////////////////////////////////////////////////// +// Image Encode +////////////////////////////////////////////////////////////////////////////// + +/** + * Encode tensor to image format + * + * @param data Output: encoded data (caller must free with imp_free) + * @param size Output: data size + * @param tensor Input tensor + * @param ctx Context handle + * @param opts Encode options (NULL for defaults: JPEG quality 90) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code + */ 
+imp_status_t imp_encode_image(void** data, + size_t* size, + imp_tensor_t* tensor, + imp_context_t* ctx, + const imp_image_encode_opts_t* opts, + imp_encode_callback_t callback, + void* user_data); + +/** + * Encode tensor to image file + * + * @param file_path Output file path (format inferred from extension) + * @param tensor Input tensor + * @param ctx Context handle + * @param opts Encode options (NULL for defaults) + * @param callback Async callback (NULL for synchronous) + * @param user_data User data for callback + * @return Status code + */ +imp_status_t imp_encode_image_file(const char* file_path, + imp_tensor_t* tensor, + imp_context_t* ctx, + const imp_image_encode_opts_t* opts, + imp_encode_callback_t callback, + void* user_data); + +////////////////////////////////////////////////////////////////////////////// +// Video Encode +////////////////////////////////////////////////////////////////////////////// + +/** + * Opaque video encoder handle (pointer type) + */ +typedef struct imp_video_encoder_s imp_video_encoder_t; + +/** + * Create video encoder + * + * @param encoder Output: encoder handle pointer + * @param width Frame width + * @param height Frame height + * @param ctx Context handle + * @param opts Encode options + * @return Status code + */ +imp_status_t imp_video_encoder_create(imp_video_encoder_t** encoder, + uint32_t width, + uint32_t height, + imp_context_t* ctx, + const imp_video_encode_opts_t* opts); + +/** + * Encode frame + * + * @param encoder Encoder handle + * @param tensor Frame tensor + * @return Status code + */ +imp_status_t imp_video_encoder_write(imp_video_encoder_t* encoder, + imp_tensor_t* tensor); + +/** + * Finalize and close encoder + * + * @param encoder Encoder handle + */ +void imp_video_encoder_close(imp_video_encoder_t* encoder); + +// TODO: One-shot video encode (batch encode entire video at once) +// Encode 4D tensor [frames, H, W, C] to video +// To memory (e.g. 
for network send): +// imp_status_t imp_encode_video(void** data, size_t* size, imp_tensor_t* tensor, ...); +// To file: +// imp_status_t imp_encode_video_file(const char* file_path, imp_tensor_t* tensor, ...); + +////////////////////////////////////////////////////////////////////////////// +// Audio Encode +////////////////////////////////////////////////////////////////////////////// + +/** + * Audio encode options + */ +typedef struct { + const char* codec; // "mp3", "aac", "opus", "flac", "wav" + uint32_t bitrate_kbps; // Target bitrate for lossy codecs (default: 192) + uint32_t sample_rate; // Output sample rate (0 = match input) + uint32_t channels; // Output channels (0 = match input) + const char* output_path; // Output file path +} imp_audio_encode_opts_t; + +/** + * Opaque audio encoder handle (pointer type) + */ +typedef struct imp_audio_encoder_s imp_audio_encoder_t; + +/** + * Create audio encoder + * + * @param encoder Output: encoder handle pointer + * @param opts Encode options + * @param callback Completion callback (NULL for synchronous close) + * @param user_data User data for callback + * @return Status code + */ +imp_status_t imp_audio_encoder_create(imp_audio_encoder_t** encoder, + const imp_audio_encode_opts_t* opts, + imp_encode_callback_t callback, + void* user_data); + +/** + * Write audio samples to encoder + * + * @param encoder Encoder handle + * @param tensor Audio samples tensor (from imp_decode_audio_file or custom) + * @return Status code + */ +imp_status_t imp_audio_encoder_write(imp_audio_encoder_t* encoder, + imp_tensor_t* tensor); + +/** + * Finalize and close encoder + * + * If callback was provided at create, it will be called when encoding completes. + * + * @param encoder Encoder handle + */ +void imp_audio_encoder_close(imp_audio_encoder_t* encoder); + +/** + * One-shot audio encode to memory buffer. + * + * Encodes raw float PCM samples into the specified codec format and + * returns the encoded bytes. 
For "wav" the library uses a built-in
+ * header writer (no GStreamer overhead).  For "pcm" the raw float
+ * bytes are returned as-is.  For lossy/lossless codecs ("mp3",
+ * "flac", "opus", "aac") a GStreamer pipeline is created internally.
+ *
+ * The caller must free the returned buffer with imp_free().
+ *
+ * @param data         Output: pointer to encoded data (heap-allocated; release with imp_free())
+ * @param data_size    Output: size of encoded data in bytes
+ * @param samples      Input float PCM samples (mono, or channel-interleaved when there is more than one channel)
+ * @param num_samples  Number of float values in @p samples (total element count of the array)
+ * @param opts         Encode options (codec, sample_rate, channels, bitrate).
+ *                     opts->output_path is ignored — output goes to memory.
+ * @return IMP_OK on success
+ */
+imp_status_t imp_encode_audio(void** data,
+                              size_t* data_size,
+                              const float* samples,
+                              size_t num_samples,
+                              const imp_audio_encode_opts_t* opts);
+
+/**
+ * One-shot audio encode to file.
+ *
+ * Same as imp_encode_audio() but writes directly to disk.
+ *
+ * @param file_path    Output file path
+ * @param samples      Input float PCM samples (same layout as for imp_encode_audio())
+ * @param num_samples  Number of float values in @p samples
+ * @param opts         Encode options (codec, bitrate, sample_rate, channels).
+ *                     opts->output_path is overridden by @p file_path.
+ * @return IMP_OK on success
+ */
+imp_status_t imp_encode_audio_file(const char* file_path,
+                                   const float* samples,
+                                   size_t num_samples,
+                                   const imp_audio_encode_opts_t* opts);
+
+//////////////////////////////////////////////////////////////////////////////
+// Tensor Utilities
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Get tensor's device type
+ *
+ * @param tensor       Tensor handle
+ * @param device_type  Output: device type (CPU, GPU, NPU)
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_device_type(imp_tensor_t* tensor,
+                                        imp_device_type_t* device_type);
+
+/**
+ * Get tensor's device name (e.g., "GPU.0", "GPU.1", "CPU", "NPU")
+ *
+ * @param tensor       Tensor handle
+ * @param device_name  Output: device name string. NOTE(review): ownership/lifetime is not stated here — presumably owned by the tensor and not to be freed; confirm.
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_device_name(imp_tensor_t* tensor,
+                                        const char** device_name);
+
+/**
+ * Get tensor's context type (for GPU/NPU tensors)
+ * Returns IMP_ERROR_INVALID_ARGUMENT for CPU tensors.
+ *
+ * @param tensor        Tensor handle
+ * @param context_type  Output: context type (OpenCL, D3D11, VA-API)
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_context_type(imp_tensor_t* tensor,
+                                         imp_context_type_t* context_type);
+
+/**
+ * Get underlying OpenVINO tensor pointer
+ *
+ * @param tensor       Tensor handle
+ * @param ov_tensor    Output: pointer to underlying OV tensor (ov::Tensor* or ov::RemoteTensor*)
+ * @param device_type  Output: device type (tells the caller which of the two types to cast @p ov_tensor to)
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_ov(imp_tensor_t* tensor,
+                               void** ov_tensor,
+                               imp_device_type_t* device_type);
+
+/**
+ * Get tensor shape
+ *
+ * @param tensor    Tensor handle
+ * @param dims      Output: dimension array (storage provided by the caller)
+ * @param num_dims  Input: capacity of @p dims in elements; Output: actual number of dimensions
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_shape(imp_tensor_t* tensor,
+                                  int64_t* dims,
+                                  size_t* num_dims);
+
+/**
+ * Get tensor element type
+ *
+ * @param tensor  Tensor handle
+ * @param type    Output: element type
+ * @return Status code
+ */
+imp_status_t imp_tensor_get_element_type(imp_tensor_t* tensor,
+                                         imp_element_type_t* type);
+
+/**
+ * Release tensor
+ *
+ * @param tensor  Tensor handle
+ */
+void imp_tensor_release(imp_tensor_t* tensor);
+
+//////////////////////////////////////////////////////////////////////////////
+// Memory Management
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Free memory allocated by IMP functions (for example the buffer returned by imp_encode_audio())
+ *
+ * @param ptr  Pointer to free
+ */
+void imp_free(void* ptr);
+
+//////////////////////////////////////////////////////////////////////////////
+// Utility
+//////////////////////////////////////////////////////////////////////////////
+
+/**
+ * Get API version
+ *
+ * @param major  Output: major version
+ * @param minor  Output: minor version
+ * @param patch  Output: patch version
+ */
+void imp_get_version(int* major, int* minor, int* patch);
+
+/**
+ * Check
hardware decode support
+ *
+ * @param ctx        Context handle
+ * @param supported  Output: set to true if HW decode is available. NOTE(review): `bool` in a C API — confirm <stdbool.h> is included near the top of this header for C callers.
+ * @return Status code
+ */
+imp_status_t imp_hw_decode_supported(imp_context_t* ctx, bool* supported);
+
+/**
+ * Check hardware encode support
+ *
+ * @param ctx        Context handle
+ * @param supported  Output: set to true if HW encode is available
+ * @return Status code
+ */
+imp_status_t imp_hw_encode_supported(imp_context_t* ctx, bool* supported);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // INTEL_MPI_H
diff --git a/third_party/gstreamer/BUILD b/third_party/gstreamer/BUILD
new file mode 100644
index 0000000000..ae706597d0
--- /dev/null
+++ b/third_party/gstreamer/BUILD
@@ -0,0 +1,2 @@
+# Intentionally empty BUILD file — it makes this directory a Bazel package so that
+# gstreamer_windows.BUILD can be referenced as @//third_party/gstreamer:gstreamer_windows.BUILD
diff --git a/third_party/gstreamer/gstreamer_windows.BUILD b/third_party/gstreamer/gstreamer_windows.BUILD
new file mode 100644
index 0000000000..6b53b18cac
--- /dev/null
+++ b/third_party/gstreamer/gstreamer_windows.BUILD
@@ -0,0 +1,131 @@
+#
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# GStreamer pre-built import for Windows (msvc_x86_64 installer layout)
+#
+# Expected install location: C:\Program Files\gstreamer\1.0\msvc_x86_64
+# The new_local_repository rule in WORKSPACE sets its "path" attribute to that directory;
+# this BUILD file maps headers and import libs for Bazel consumption.
+
+package(default_visibility = ["//visibility:public"])
+
+# ---------- headers (GStreamer and GLib include trees, including the per-arch config headers under lib/) ----------
+
+cc_library(
+    name = "gstreamer_headers",
+    hdrs = glob([
+        "include/gstreamer-1.0/**/*.h",
+        "lib/gstreamer-1.0/include/**/*.h",
+    ]),
+    includes = [
+        "include/gstreamer-1.0",
+        "lib/gstreamer-1.0/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "glib_headers",
+    hdrs = glob([
+        "include/glib-2.0/**/*.h",
+        "lib/glib-2.0/include/**/*.h",
+    ]),
+    includes = [
+        "include/glib-2.0",
+        "lib/glib-2.0/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# ---------- import libraries (MSVC .lib stubs; system_provided = True means the matching DLLs must be found at run time) ----------
+
+cc_import(
+    name = "gstreamer_lib",
+    interface_library = "lib/gstreamer-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstapp_lib",
+    interface_library = "lib/gstapp-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstvideo_lib",
+    interface_library = "lib/gstvideo-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstd3d11_lib",
+    interface_library = "lib/gstd3d11-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstpbutils_lib",
+    interface_library = "lib/gstpbutils-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstbase_lib",
+    interface_library = "lib/gstbase-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gstaudio_lib",
+    interface_library = "lib/gstaudio-1.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "glib_lib",
+    interface_library = "lib/glib-2.0.lib",
+    system_provided = True,
+)
+
+cc_import(
+    name = "gobject_lib",
+    interface_library = "lib/gobject-2.0.lib",
+    system_provided = True,
+)
+
+# ---------- aggregate target (headers + every import library above, plus the Direct3D 11 / DXGI system libs) ----------
+
+cc_library(
+    name = "gstreamer",
+    deps = [
+        ":gstreamer_headers",
+        ":glib_headers",
+        ":gstreamer_lib",
+        ":gstapp_lib",
+        ":gstvideo_lib",
+        ":gstd3d11_lib",
+        ":gstpbutils_lib",
+        ":gstbase_lib",
+        ":gstaudio_lib",
+        ":glib_lib",
+        ":gobject_lib",
+    ],
+    linkopts = [
+        "d3d11.lib",
+        "dxgi.lib",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/windows_build.bat b/windows_build.bat
index 4c3f100075..c8caa31cfd 100644
--- a/windows_build.bat
+++ b/windows_build.bat
@@ -41,23 +41,15 @@ IF "%~3"=="--with_tests" (
     set "buildTargets=//src:ovms"
 )
 
-IF "%~4"=="--integrity" (
-    echo Building model server with integrity checks
-    set "buildWithIntegrity=--config=win_integritycheck"
-) ELSE (
-    echo Building model server without integrity checks
-    set "buildWithIntegrity="
-)
-
 set "bazelStartupCmd=--output_user_root=!BAZEL_SHORT_PATH!"
 set "openvino_dir=!BAZEL_SHORT_PATH!/openvino/runtime/cmake"
 
-set "buildCommand=bazel %bazelStartupCmd% build %buildWithIntegrity% %bazelBuildArgs% --action_env OpenVINO_DIR=%openvino_dir% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures %buildTargets% 2>&1 | tee win_build.log"
+set "buildCommand=bazel %bazelStartupCmd% build %bazelBuildArgs% --action_env OpenVINO_DIR=%openvino_dir% --jobs=4 --verbose_failures %buildTargets% 2>&1 | tee win_build.log"
 set "setOvmsVersionCmd=python windows_set_ovms_version.py"
 
 :: Setting PATH environment variable based on default windows node settings: Added ovms_windows specific python settings and c:/opt and removed unused Nvidia and OCL specific tools.
 :: When changing the values here you can print the node default PATH value and base your changes on it.
-set "setPath=C:\opt;C:\opt\Python312\;C:\opt\Python312\Scripts\;C:\opt\msys64\usr\bin\;%PATH%;"
+set "setPath=C:\opt;C:\opt\Python312\;C:\opt\Python312\Scripts\;C:\opt\msys64\usr\bin\;C:\Program Files\gstreamer\1.0\msvc_x86_64\bin;%PATH%;"
 set "PYTHONHOME=C:\opt\Python312"
 set "envPath=win_environment.log"
 set "setPythonPath=%cd%\bazel-out\x64_windows-opt\bin\src\python\binding"
diff --git a/windows_test.bat b/windows_test.bat
index f00948a18e..ba789bce88 100644
--- a/windows_test.bat
+++ b/windows_test.bat
@@ -41,7 +41,7 @@ IF "%~3"=="" (
     set "gtestFilter=%3"
 )
 
-set "buildTestCommand=bazel %bazelStartupCmd% build %bazelBuildArgs% --jobs=%NUMBER_OF_PROCESSORS% --verbose_failures //src:ovms_test"
+set "buildTestCommand=bazel %bazelStartupCmd% build %bazelBuildArgs% --jobs=6 --verbose_failures //src:ovms_test"
 set "changeConfigsCmd=python windows_change_test_configs.py"
 set "runTest=%cd%\bazel-bin\src\ovms_test.exe --gtest_filter=!gtestFilter! > win_full_test.log 2>&1"
 
From 36d34070f175e10007d959a47f29881b38ad9a10 Mon Sep 17 00:00:00 2001
From: Tobiszewski
Date: Thu, 11 Jun 2026 16:40:55 +0200
Subject: [PATCH 2/3] Initial commit

---
 src/mpi/intel_mpi.cpp | 787 +++++++++++++++++++++++++++++++++++++-----
 src/mpi/intel_mpi.h   |   4 +-
 2 files changed, 695 insertions(+), 96 deletions(-)

diff --git a/src/mpi/intel_mpi.cpp b/src/mpi/intel_mpi.cpp
index 5e4122331f..317fd068f8 100644
--- a/src/mpi/intel_mpi.cpp
+++ b/src/mpi/intel_mpi.cpp
@@ -140,7 +140,7 @@ imp_status_t imp_video_source_set(imp_video_source_t* source,
     std::string k(key), v(value);
     if (k == "path" || k == "url") source->path = v;
     else if (k == "device") source->device = v;
-    else if (k == "width") source->width = std::stoi(v);
+    else if (k == "width") source->width = std::stoi(v);  // FIXME: std::stoi throws std::invalid_argument / std::out_of_range on bad input and nothing here catches it (same for "height" and "framerate" below)
     else if (k == "height") source->height = std::stoi(v);
     else if (k == "framerate") source->framerate = std::stoi(v);
     else if (k == "format") source->format = v;
@@ -295,6 +295,22 @@ imp_status_t
imp_video_open(imp_video_stream_t** stream,
     std::string queue_str = is_file ? queue_file : queue_cam;
     std::string sink_props = is_file ? appsink_file : appsink_cam;
 
+    // Audit mode (file sources only): lock-step hand-off — one-buffer queue and one-buffer, non-dropping appsink —
+    // to disable GStreamer pre-buffering and isolate the effect of pipeline parallelism. The appsink "sync" (clock) property stays false.
+    if (is_file && opts && opts->sync_appsink) {
+        queue_str = "queue max-size-buffers=1 max-size-time=0 max-size-bytes=0 leaky=no";
+        sink_props = "emit-signals=false sync=false max-buffers=1 drop=false";
+    }
+
+    // Audit mode (file sources only): bounded queue/appsink depth of opts->queue_depth buffers; applied last, so it overrides sync_appsink.
+    if (is_file && opts && opts->queue_depth > 0) {
+        uint32_t d = opts->queue_depth;
+        queue_str = "queue max-size-buffers=" + std::to_string(d) +
+                    " max-size-time=0 max-size-bytes=0 leaky=no";
+        sink_props = "emit-signals=false sync=false max-buffers=" +
+                     std::to_string(d) + " drop=false";
+    }
+
     // Source + decode + NV12 convert
     if (is_file) {
         pipeline_str =
@@ -738,128 +754,459 @@ imp_status_t imp_hw_encode_supported(imp_context_t* ctx, bool* supported) {
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// Image decode / encode — stubs
+// Image decode / encode
 //////////////////////////////////////////////////////////////////////////////
 
-imp_status_t imp_decode_image(imp_tensor_t** t, const void* d, size_t s,
-                              imp_context_t* c, const imp_image_decode_opts_t* o,
-                              imp_decode_callback_t cb, void* ud) {
-    (void)t;(void)d;(void)s;(void)c;(void)o;(void)cb;(void)ud;
-    return IMP_ERROR_INTERNAL;
+// Map imp_pixel_format_t → GStreamer video format string (values not listed below fall back to "RGB")
+static const char* imp_format_to_gst(imp_pixel_format_t fmt) {
+    switch (fmt) {
+        case IMP_FORMAT_RGB:  return "RGB";
+        case IMP_FORMAT_BGR:  return "BGR";
+        case IMP_FORMAT_RGBA: return "RGBA";
+        case IMP_FORMAT_BGRA: return "BGRA";
+        case IMP_FORMAT_NV12: return "NV12";
+        case IMP_FORMAT_I420: return "I420";
+        case IMP_FORMAT_GRAY: return "GRAY8";
+        default:              return "RGB";
+    }
+}
+
+// Channels for a
given pixel format (planar formats return 0)
+static int imp_format_channels(imp_pixel_format_t fmt) {
+    switch (fmt) {
+        case IMP_FORMAT_RGB:  return 3;
+        case IMP_FORMAT_BGR:  return 3;
+        case IMP_FORMAT_RGBA: return 4;
+        case IMP_FORMAT_BGRA: return 4;
+        case IMP_FORMAT_GRAY: return 1;
+        default:              return 0;
+    }
+}
+
+// Internal: set the pipeline PLAYING, pull one decoded sample from appsink (10 s timeout), copy it into a new CPU tensor stored in *out_tensor, then set the pipeline to NULL. The caller keeps its references to pipeline and appsink.
+static imp_status_t imp_image_decode_pull(GstElement* pipeline,
+                                          GstElement* appsink,
+                                          imp_tensor_t** out_tensor,
+                                          imp_pixel_format_t fmt,
+                                          imp_layout_t layout,
+                                          imp_element_type_t elem_type) {
+    GstStateChangeReturn ret = gst_element_set_state(pipeline, GST_STATE_PLAYING);
+    if (ret == GST_STATE_CHANGE_FAILURE) {
+        gst_element_set_state(pipeline, GST_STATE_NULL);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+
+    GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND);
+    if (!sample) {
+        gst_element_set_state(pipeline, GST_STATE_NULL);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+    GstBuffer* buffer = gst_sample_get_buffer(sample);
+    GstCaps* caps = gst_sample_get_caps(sample);
+    GstVideoInfo info;
+    if (!buffer || !caps || !gst_video_info_from_caps(&info, caps)) {  // info.width/height are only meaningful when the caps parsed
+        gst_sample_unref(sample);
+        gst_element_set_state(pipeline, GST_STATE_NULL);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+    int w = info.width;
+    int h = info.height;
+    auto* tensor = new imp_tensor_s();
+    tensor->width = w;
+    tensor->height = h;
+    tensor->format = fmt;
+    tensor->device_type = IMP_DEVICE_CPU;
+    tensor->valid = false;
+
+    GstVideoFrame vframe;
+    if (!gst_video_frame_map(&vframe, &info, buffer, GST_MAP_READ)) {
+        delete tensor;
+        gst_sample_unref(sample);
+        gst_element_set_state(pipeline, GST_STATE_NULL);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+
+    if (fmt == IMP_FORMAT_NV12) {
+        size_t y_size = (size_t)w * h;
+        size_t uv_size = (size_t)w * (h / 2);
+        uint8_t* y_buf = (uint8_t*)malloc(y_size);
+        uint8_t* uv_buf = (uint8_t*)malloc(uv_size);
+        if (!y_buf || !uv_buf) {
+            free(y_buf); free(uv_buf);
+            gst_video_frame_unmap(&vframe);
+            delete tensor;
+            gst_sample_unref(sample);
+            gst_element_set_state(pipeline, GST_STATE_NULL);
+            return IMP_ERROR_OUT_OF_MEMORY;
+        }
+
+        uint8_t* y_src = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+        int y_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+        for (int row = 0; row < h; row++)  // copy row by row to drop GStreamer's stride padding
+            memcpy(y_buf + row * w, y_src + row * y_stride, w);
+
+        uint8_t* uv_src = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 1);
+        int uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 1);
+        for (int row = 0; row < h / 2; row++)
+            memcpy(uv_buf + row * w, uv_src + row * uv_stride, w);
+
+        tensor->y_data = y_buf;
+        tensor->uv_data = uv_buf;
+        tensor->valid = true;
+    } else {
+        int channels = imp_format_channels(fmt);
+        if (channels == 0) channels = 3;
+
+        uint8_t* src_data = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+        int stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+
+        if (elem_type == IMP_TYPE_FP32) {
+            size_t fp32_size = (size_t)w * h * channels;
+            std::vector<float> fp32_data(fp32_size);  // tightly packed HWC, scaled to [0, 1]
+            for (int row = 0; row < h; row++) {
+                uint8_t* row_ptr = src_data + row * stride;
+                for (int col = 0; col < w * channels; col++)
+                    fp32_data[row * w * channels + col] = row_ptr[col] / 255.0f;
+            }
+
+            if (layout == IMP_LAYOUT_NCHW) {
+                ov::Shape shape = {1, (size_t)channels, (size_t)h, (size_t)w};
+                tensor->ov_tensor = ov::Tensor(ov::element::f32, shape);
+                float* dst = tensor->ov_tensor.data<float>();
+                for (int c = 0; c < channels; c++)
+                    for (int row = 0; row < h; row++)
+                        for (int col = 0; col < w; col++)
+                            dst[c * h * w + row * w + col] = fp32_data[row * w * channels + col * channels + c];  // HWC source index: pixels are `channels` floats apart
+            } else {
+                ov::Shape shape = {1, (size_t)h, (size_t)w, (size_t)channels};
+                tensor->ov_tensor = ov::Tensor(ov::element::f32, shape);
+                memcpy(tensor->ov_tensor.data(), fp32_data.data(), fp32_size * sizeof(float));
+            }
+        } else {
+            if (layout == IMP_LAYOUT_NCHW) {
+                ov::Shape shape = {1, (size_t)channels, (size_t)h, (size_t)w};
+                tensor->ov_tensor = ov::Tensor(ov::element::u8, shape);
+                uint8_t* dst = tensor->ov_tensor.data<uint8_t>();
+                for (int c = 0; c < channels; c++)
+                    for (int row = 0; row < h; row++) {
+                        uint8_t* row_ptr = src_data + row * stride;
+                        for (int col = 0; col < w; col++)
+                            dst[c * h * w + row * w + col] = row_ptr[col * channels + c];
+                    }
+            } else {
+                ov::Shape shape = {1, (size_t)h, (size_t)w, (size_t)channels};
+                tensor->ov_tensor = ov::Tensor(ov::element::u8, shape);
+                uint8_t* dst = tensor->ov_tensor.data<uint8_t>();
+                int row_bytes = w * channels;
+                for (int row = 0; row < h; row++)
+                    memcpy(dst + row * row_bytes, src_data + row * stride, row_bytes);
+            }
+        }
+        tensor->valid = true;
+    }
+
+    gst_video_frame_unmap(&vframe);
+    gst_sample_unref(sample);
+    gst_element_set_state(pipeline, GST_STATE_NULL);
+
+    *out_tensor = tensor;
+    return IMP_OK;
 }
 
 imp_status_t imp_decode_image_file(imp_tensor_t** t, const char* f,
                                    imp_context_t* c, const imp_image_decode_opts_t* o,
                                    imp_decode_callback_t cb, void* ud) {
-    (void)t;(void)f;(void)c;(void)o;(void)cb;(void)ud;
-    return IMP_ERROR_INTERNAL;
+    (void)cb; (void)ud;  // decoding is synchronous; the callback is not used
+
+    if (!t || !f) return IMP_ERROR_INVALID_ARGUMENT;
+
+    gst_init(nullptr, nullptr);
+
+    imp_pixel_format_t fmt = (o && o->output_format) ? o->output_format : IMP_FORMAT_RGB;
+    imp_layout_t layout = (o) ? o->output_layout : IMP_LAYOUT_NHWC;
+    imp_element_type_t etype = (o) ? o->output_type : IMP_TYPE_U8;
+    uint32_t rw = (o) ? o->resize_width : 0;
+    uint32_t rh = (o) ? o->resize_height : 0;
+
+    const char* gst_fmt = imp_format_to_gst(fmt);
+
+    // Normalize path (backslashes break gst_parse_launch)
+    std::string path(f);
+    for (auto& ch : path) if (ch == '\\') ch = '/';
+
+    std::string pipe = "filesrc location=\"" + path + "\" ! decodebin ! videoconvert ! "
+                       "video/x-raw,format=" + gst_fmt;
+    if (rw > 0 && rh > 0) {
+        pipe += " ! videoscale ! video/x-raw,width=" + std::to_string(rw) +
+                ",height=" + std::to_string(rh);
+    }
+    pipe += " ! appsink name=sink sync=false emit-signals=false";
+
+    GError* error = nullptr;
+    GstElement* pipeline = gst_parse_launch(pipe.c_str(), &error);
+    if (error) {
+        if (c) c->last_error = error->message;
+        g_error_free(error);
+        if (pipeline) gst_object_unref(pipeline);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+
+    GstElement* appsink = gst_bin_get_by_name(GST_BIN(pipeline), "sink");
+    if (!appsink) {
+        gst_object_unref(pipeline);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+
+    imp_status_t status = imp_image_decode_pull(pipeline, appsink, t, fmt, layout, etype);
+
+    gst_object_unref(appsink);
+    gst_object_unref(pipeline);
+    return status;
 }
 
-imp_status_t imp_decode_audio(imp_tensor_t** tensor, const void* data, size_t size,
-                              imp_context_t* ctx, const imp_audio_decode_opts_t* opts,
-                              imp_decode_callback_t callback, void* user_data) {
-    (void)callback; (void)user_data;  // async not yet implemented
-    if (!tensor || !data || size == 0) return IMP_ERROR_INVALID_ARGUMENT;
+imp_status_t imp_decode_image(imp_tensor_t** t, const void* d, size_t s,
+                              imp_context_t* c, const imp_image_decode_opts_t* o,
+                              imp_decode_callback_t cb, void* ud) {
+    (void)cb; (void)ud;  // decoding is synchronous; the callback is not used
+
+    if (!t || !d || s == 0) return IMP_ERROR_INVALID_ARGUMENT;
 
     gst_init(nullptr, nullptr);
 
-    // Resolve options
-    uint32_t target_rate = (opts && opts->sample_rate > 0) ? opts->sample_rate : 16000;
-    uint32_t target_channels = (opts && opts->channels > 0) ? opts->channels : 1;
-    bool normalize = opts ? opts->normalize : true;
-    (void)normalize;  // GStreamer F32LE is already [-1,1]
+    imp_pixel_format_t fmt = (o && o->output_format) ? o->output_format : IMP_FORMAT_RGB;
+    imp_layout_t layout = (o) ? o->output_layout : IMP_LAYOUT_NHWC;
+    imp_element_type_t etype = (o) ? o->output_type : IMP_TYPE_U8;
+    uint32_t rw = (o) ? o->resize_width : 0;
+    uint32_t rh = (o) ? o->resize_height : 0;
 
-    // Build pipeline: appsrc → decodebin → audioconvert → audioresample → caps → appsink
-    std::string capsStr =
-        "audio/x-raw,format=F32LE,channels=" + std::to_string(target_channels) +
-        ",rate=" + std::to_string(target_rate);
+    const char* gst_fmt = imp_format_to_gst(fmt);
 
-    std::string pipelineStr =
-        "appsrc name=src ! decodebin ! audioconvert ! audioresample ! "
-        + capsStr + " ! appsink name=sink sync=false";
+    std::string pipe = "appsrc name=src ! decodebin ! videoconvert ! "
+                       "video/x-raw,format=" + std::string(gst_fmt);
+    if (rw > 0 && rh > 0) {
+        pipe += " ! videoscale ! video/x-raw,width=" + std::to_string(rw) +
+                ",height=" + std::to_string(rh);
+    }
+    pipe += " ! appsink name=sink sync=false emit-signals=false";
 
     GError* error = nullptr;
-    GstElement* pipeline = gst_parse_launch(pipelineStr.c_str(), &error);
-    if (error || !pipeline) {
-        if (ctx) ctx->last_error = error ? error->message : "audio pipeline creation failed";
-        if (error) g_error_free(error);
+    GstElement* pipeline = gst_parse_launch(pipe.c_str(), &error);
+    if (error) {
+        if (c) c->last_error = error->message;
+        g_error_free(error);
+        if (pipeline) gst_object_unref(pipeline);
         return IMP_ERROR_DECODE_FAILED;
     }
 
-    GstElement* appsrc  = gst_bin_get_by_name(GST_BIN(pipeline), "src");
+    GstElement* appsrc = gst_bin_get_by_name(GST_BIN(pipeline), "src");
     GstElement* appsink = gst_bin_get_by_name(GST_BIN(pipeline), "sink");
     if (!appsrc || !appsink) {
         if (appsrc) gst_object_unref(appsrc);
         if (appsink) gst_object_unref(appsink);
         gst_object_unref(pipeline);
-        return IMP_ERROR_INTERNAL;
+        return IMP_ERROR_DECODE_FAILED;
     }
 
-    // Allocate GStreamer buffer and copy input data
-    GstBuffer* buf = gst_buffer_new_allocate(nullptr, size, nullptr);
+    gst_element_set_state(pipeline, GST_STATE_PLAYING);
+
+    GstBuffer* buf = gst_buffer_new_allocate(nullptr, s, nullptr);
     GstMapInfo map;
     if (gst_buffer_map(buf, &map, GST_MAP_WRITE)) {
-        memcpy(map.data, data, size);
+        memcpy(map.data, d, s);
         gst_buffer_unmap(buf, &map);
     }
+    gst_app_src_push_buffer(GST_APP_SRC(appsrc), buf);
+    gst_app_src_end_of_stream(GST_APP_SRC(appsrc));
+
+    GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND);
+    imp_status_t status;
+    if (!sample) {
+        status = IMP_ERROR_DECODE_FAILED;
+    } else {
+        GstBuffer* sbuf = gst_sample_get_buffer(sample);
+        GstCaps* caps = gst_sample_get_caps(sample);
+        GstVideoInfo info;
+        gst_video_info_init(&info);  // defined (zero-sized) contents even if the caps below cannot be parsed
+        const gboolean info_ok = caps && gst_video_info_from_caps(&info, caps);
+        int w = info.width;
+        int h = info.height;
+        auto* tensor = new imp_tensor_s();
+        tensor->width = w;
+        tensor->height = h;
+        tensor->format = fmt;
+        tensor->device_type = IMP_DEVICE_CPU;
+        tensor->valid = false;
+
+        GstVideoFrame vframe;
+        if (info_ok && gst_video_frame_map(&vframe, &info, sbuf, GST_MAP_READ)) {  // otherwise the tensor stays invalid and is reported as DECODE_FAILED below
+            if (fmt == IMP_FORMAT_NV12) {
+                size_t y_size = (size_t)w * h;
+                size_t uv_size = (size_t)w * (h / 2);
+                uint8_t* y_buf = (uint8_t*)malloc(y_size);
+                uint8_t* uv_buf = (uint8_t*)malloc(uv_size);
+                if (y_buf && uv_buf) {
+                    uint8_t* y_src = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+                    int y_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+                    for (int row = 0; row < h; row++)
+                        memcpy(y_buf + row * w, y_src + row * y_stride, w);
+                    uint8_t* uv_src = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 1);
+                    int uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 1);
+                    for (int row = 0; row < h / 2; row++)
+                        memcpy(uv_buf + row * w, uv_src + row * uv_stride, w);
+                    tensor->y_data = y_buf;
+                    tensor->uv_data = uv_buf;
+                    tensor->valid = true;
+                } else {
+                    free(y_buf); free(uv_buf);
+                }
+            } else {
+                int channels = imp_format_channels(fmt);
+                if (channels == 0) channels = 3;
+                uint8_t* src_data = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+                int stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+
+                if (etype == IMP_TYPE_FP32) {
+                    size_t fp32_count = (size_t)w * h * channels;
+                    std::vector<float> fp32_data(fp32_count);  // tightly packed HWC, scaled to [0, 1]
+                    for (int row = 0; row < h; row++) {
+                        uint8_t* rp = src_data + row * stride;
+                        for (int col = 0; col < w * channels; col++)
+                            fp32_data[row * w * channels + col] = rp[col] / 255.0f;
+                    }
+                    if (layout == IMP_LAYOUT_NCHW) {
+                        ov::Shape shape = {1, (size_t)channels, (size_t)h, (size_t)w};
+                        tensor->ov_tensor = ov::Tensor(ov::element::f32, shape);
+                        float* dst = tensor->ov_tensor.data<float>();
+                        for (int ch = 0; ch < channels; ch++)
+                            for (int row = 0; row < h; row++)
+                                for (int col = 0; col < w; col++)
+                                    dst[ch * h * w + row * w + col] = fp32_data[row * w * channels + col * channels + ch];  // HWC source index: pixels are `channels` floats apart
+                    } else {
+                        ov::Shape shape = {1, (size_t)h, (size_t)w, (size_t)channels};
+                        tensor->ov_tensor = ov::Tensor(ov::element::f32, shape);
+                        memcpy(tensor->ov_tensor.data(), fp32_data.data(), fp32_count * sizeof(float));
+                    }
+                } else {
+                    if (layout == IMP_LAYOUT_NCHW) {
+                        ov::Shape shape = {1, (size_t)channels, (size_t)h, (size_t)w};
+                        tensor->ov_tensor = ov::Tensor(ov::element::u8, shape);
+                        uint8_t* dst = tensor->ov_tensor.data<uint8_t>();
+                        for (int ch = 0; ch < channels; ch++)
+                            for (int row = 0; row < h; row++) {
+                                uint8_t* rp = src_data + row * stride;
+                                for (int col = 0; col < w; col++)
+                                    dst[ch * h * w + row * w + col] = rp[col * channels + ch];
+                            }
+                    } else {
+                        ov::Shape shape = {1, (size_t)h, (size_t)w, (size_t)channels};
+                        tensor->ov_tensor = ov::Tensor(ov::element::u8, shape);
+                        uint8_t* dst = tensor->ov_tensor.data<uint8_t>();
+                        int row_bytes = w * channels;
+                        for (int row = 0; row < h; row++)
+                            memcpy(dst + row * row_bytes, src_data + row * stride, row_bytes);
+                    }
+                }
+                tensor->valid = true;
+            }
+            gst_video_frame_unmap(&vframe);
+        }
+
+        gst_sample_unref(sample);
+        if (tensor->valid) {
+            *t = tensor;
+            status = IMP_OK;
+        } else {
+            delete tensor;
+            status = IMP_ERROR_DECODE_FAILED;
+        }
+    }
+
+    gst_element_set_state(pipeline, GST_STATE_NULL);
+    gst_object_unref(appsrc);
+    gst_object_unref(appsink);
+    gst_object_unref(pipeline);
+    return status;
+}
+
+imp_status_t imp_decode_audio(imp_tensor_t** t, const void* d, size_t s,
+                              imp_context_t* c, const imp_audio_decode_opts_t* o,
+                              imp_decode_callback_t cb, void* ud) {
+    (void)cb; (void)ud;  // async completion is not implemented; decoding is synchronous
+    if (!t || !d || s == 0) return IMP_ERROR_INVALID_ARGUMENT;  // o may be NULL: defaults apply, as in the previous implementation
+
+    gst_init(nullptr, nullptr);
+
+    uint32_t sampleRate = (o && o->sample_rate > 0) ? o->sample_rate : 16000;
+    uint32_t channels = (o && o->channels > 0) ? o->channels : 1;
+
+    std::string pipeStr =
+        "appsrc name=src ! decodebin ! audioconvert ! audioresample ! "
+        "audio/x-raw,format=F32LE,channels=" + std::to_string(channels) +
+        ",rate=" + std::to_string(sampleRate) +
+        " ! appsink name=sink";
+
+    GError* error = nullptr;
+    GstElement* pipeline = gst_parse_launch(pipeStr.c_str(), &error);
+    if (error || !pipeline) {
+        if (c) c->last_error = error ? error->message : "audio pipeline creation failed";
+        if (error) g_error_free(error);
+        if (pipeline) gst_object_unref(pipeline);
+        return IMP_ERROR_DECODE_FAILED;
+    }
+    GstElement* appsrc = gst_bin_get_by_name(GST_BIN(pipeline), "src");
+    GstElement* appsink = gst_bin_get_by_name(GST_BIN(pipeline), "sink");
+    if (!appsrc || !appsink) {
+        if (appsrc) gst_object_unref(appsrc);
+        if (appsink) gst_object_unref(appsink);
+        gst_object_unref(pipeline);
+        return IMP_ERROR_INTERNAL;
+    }
+
+    g_object_set(appsrc, "is-live", FALSE, nullptr);
     gst_element_set_state(pipeline, GST_STATE_PLAYING);
 
-    // Push buffer + EOS
-    gst_app_src_push_buffer(GST_APP_SRC(appsrc), buf);  // takes ownership of buf
+    // Push input buffer. gst_buffer_fill() maps, copies and unmaps internally, so a failed
+    // map can no longer be followed by a write through an unset GstMapInfo.
+    GstBuffer* buffer = gst_buffer_new_allocate(nullptr, s, nullptr);
+    if (buffer) {
+        gst_buffer_fill(buffer, 0, d, s);
+        gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer);  // appsrc takes ownership of buffer
+    }
     gst_app_src_end_of_stream(GST_APP_SRC(appsrc));
 
-    // Pull all decoded float samples
+    // Pull decoded samples
     std::vector<float> samples;
-    while (true) {
-        GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 5 * GST_SECOND);
-        if (!sample) break;
-
+    for (;;) {
+        GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND);
+        // NULL means EOS or a 10 s timeout; both end the loop (nothing decoded is reported below).
+        if (!sample) break;
         GstBuffer* outBuf = gst_sample_get_buffer(sample);
         GstMapInfo outMap;
         if (gst_buffer_map(outBuf, &outMap, GST_MAP_READ)) {
-            size_t numFloats = outMap.size / sizeof(float);
-            const float* fdata = reinterpret_cast<const float*>(outMap.data);
-            samples.insert(samples.end(), fdata, fdata + numFloats);
+            size_t nFloats = outMap.size / sizeof(float);
+            const float* fptr = reinterpret_cast<const float*>(outMap.data);
+            samples.insert(samples.end(), fptr, fptr + nFloats);
             gst_buffer_unmap(outBuf, &outMap);
         }
         gst_sample_unref(sample);
     }
 
-    // Check for pipeline errors
-    imp_status_t status = IMP_OK;
-    GstBus* bus = gst_element_get_bus(pipeline);
-    GstMessage* msg = gst_bus_pop_filtered(bus,
-        static_cast<GstMessageType>(GST_MESSAGE_ERROR));
-    if (msg) {
-        GError* err = nullptr;
-        gst_message_parse_error(msg, &err, nullptr);
-        if (ctx && err) ctx->last_error = err->message;
-        if (err) g_error_free(err);
-        gst_message_unref(msg);
-        status = IMP_ERROR_DECODE_FAILED;
-    }
-    gst_object_unref(bus);
-
     gst_element_set_state(pipeline, GST_STATE_NULL);
     gst_object_unref(appsrc);
     gst_object_unref(appsink);
     gst_object_unref(pipeline);
 
-    if (status != IMP_OK || samples.empty()) {
-        return status != IMP_OK ? status : IMP_ERROR_DECODE_FAILED;
-    }
-
-    // Wrap samples in an imp_tensor_t backed by ov::Tensor
-    auto* t = new imp_tensor_s();
-    t->ov_tensor = ov::Tensor(ov::element::f32, {1, samples.size()});
-    std::memcpy(t->ov_tensor.data(), samples.data(), samples.size() * sizeof(float));
-    t->device_type = IMP_DEVICE_CPU;
-    t->device_name = "CPU";
-    t->format = IMP_FORMAT_GRAY;  // 1-D audio — not a pixel format, but marks non-NV12
-    t->valid = true;
+    if (samples.empty()) return IMP_ERROR_DECODE_FAILED;
 
-    *tensor = t;
+    // Wrap in ov::Tensor (shape {1, N} of f32) and return as imp_tensor_t
+    auto* tensor = new imp_tensor_t();
+    tensor->ov_tensor = ov::Tensor(ov::element::f32, {1, samples.size()});
+    memcpy(tensor->ov_tensor.data(), samples.data(), samples.size() * sizeof(float));
+    tensor->device_type = IMP_DEVICE_CPU;
+    tensor->device_name = "CPU";          // kept from the previous implementation
+    tensor->format = IMP_FORMAT_GRAY;     // 1-D audio — not a pixel format, but marks non-NV12 (kept from the previous implementation)
+    tensor->valid = true;
+    *t = tensor;
     return IMP_OK;
 }
 
@@ -1147,18 +1494,282 @@ void imp_audio_close(imp_audio_stream_t* stream) {
     delete stream;
 }
 
+// Internal: prepare raw pixel buffer from tensor for GStreamer encode.
+// Uses GstVideoInfo to compute correct stride-aligned buffer size.
+static GstBuffer* imp_image_prepare_encode_buffer(imp_tensor_t* t,
+                                                  imp_pixel_format_t src_fmt,
+                                                  int w, int h, int channels,
+                                                  std::string& caps_str) {
+    const char* gst_fmt = imp_format_to_gst(src_fmt);
+
+    GstVideoFormat vfmt = gst_video_format_from_string(gst_fmt);
+    GstVideoInfo vinfo;
+    gst_video_info_set_format(&vinfo, vfmt, w, h);
+    size_t data_size = vinfo.size;  // total size including any per-row stride padding
+
+    GstBuffer* buffer = gst_buffer_new_allocate(nullptr, data_size, nullptr);
+    if (!buffer) return nullptr;
+
+    GstVideoFrame vframe;
+    if (!gst_video_frame_map(&vframe, &vinfo, buffer, GST_MAP_WRITE)) {
+        gst_buffer_unref(buffer);
+        return nullptr;
+    }
+
+    if (src_fmt == IMP_FORMAT_NV12) {  // NOTE(review): assumes t->y_data / t->uv_data are non-NULL, tightly packed w-byte rows — confirm against callers
+        uint8_t* dst_y = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+        int dst_y_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+        for (int row = 0; row < h; row++)
+            memcpy(dst_y + row * dst_y_stride, t->y_data + row * w, w);
+        uint8_t* dst_uv = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 1);
+        int dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 1);
+        for (int row = 0; row < h / 2; row++)
+            memcpy(dst_uv + row * dst_uv_stride, t->uv_data + row * w, w);
+    } else if (t->ov_tensor) {
+        uint8_t* dst = (uint8_t*)GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0);
+        int dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0);
+        int src_row_bytes = w * channels;
+
+        auto shape = t->ov_tensor.get_shape();
+        bool is_nchw = (shape.size() == 4 && shape[1] == (size_t)channels &&
+                        shape[2] == (size_t)h && shape[3] == (size_t)w);
+        bool is_fp32 = (t->ov_tensor.get_element_type() == ov::element::f32);
+
+        if (is_nchw) {
+            if (is_fp32) {
+                float* src = t->ov_tensor.data<float>();
+                for (int row = 0; row < h; row++)
+                    for (int col = 0; col < w; col++)
+                        for (int ch = 0; ch < channels; ch++)
+                            dst[row * dst_stride + col * channels + ch] =  // planar float -> interleaved u8, rounded to nearest and clamped to [0, 255]
+                                (uint8_t)(std::min(std::max(src[ch * h * w + row * w + col] * 255.0f + 0.5f, 0.0f), 255.0f));
+            } else {
+                uint8_t* src = t->ov_tensor.data<uint8_t>();
+                for (int row = 0; row < h; row++)
+                    for (int col = 0; col < w; col++)
+                        for (int ch = 0; ch < channels; ch++)
+                            dst[row * dst_stride + col * channels + ch] =
+                                src[ch * h * w + row * w + col];
+            }
+        } else {
+            if (is_fp32) {
+                float* src = t->ov_tensor.data<float>();
+                for (int row = 0; row < h; row++)
+                    for (int col = 0; col < src_row_bytes; col++)
+                        dst[row * dst_stride + col] =  // rounded to nearest so a decode (/255) -> encode round trip preserves byte values
+                            (uint8_t)(std::min(std::max(src[row * src_row_bytes + col] * 255.0f + 0.5f, 0.0f), 255.0f));
+            } else {
+                uint8_t* src = t->ov_tensor.data<uint8_t>();
+                for (int row = 0; row < h; row++)
+                    memcpy(dst + row * dst_stride, src + row * src_row_bytes, src_row_bytes);
+            }
+        }
+    }
+
+    gst_video_frame_unmap(&vframe);
+
+    caps_str = "video/x-raw,format=" + std::string(gst_fmt) +
+               ",width=" + std::to_string(w) +
+               ",height=" + std::to_string(h) +
+               ",framerate=1/1";
+
+    return buffer;
+}
+
 imp_status_t imp_encode_image(void** d, size_t* s, imp_tensor_t* t,
                               imp_context_t* c, const imp_image_encode_opts_t* o,
                               imp_encode_callback_t cb, void* ud) {
-    (void)d;(void)s;(void)t;(void)c;(void)o;(void)cb;(void)ud;
-    return IMP_ERROR_INTERNAL;
+    (void)cb; (void)ud;
+
+    if (!d || !s || !t || !t->valid) return IMP_ERROR_INVALID_ARGUMENT;
+
+    gst_init(nullptr, nullptr);
+
+    std::string format_str = (o && o->format) ? o->format : "jpeg";
+    std::string encoder_elem;
+    if (format_str == "jpeg" || format_str == "jpg") encoder_elem = "jpegenc";
+    else if (format_str == "png") encoder_elem = "pngenc";
+    else return IMP_ERROR_UNSUPPORTED_FORMAT;
+
+    int quality = (o && o->quality > 0) ?
o->quality : 90; + + int w = t->width, h = t->height; + imp_pixel_format_t src_fmt = t->format; + int channels = imp_format_channels(src_fmt); + if (channels == 0 && src_fmt != IMP_FORMAT_NV12) channels = 3; + + if ((w == 0 || h == 0) && t->ov_tensor) { + auto shape = t->ov_tensor.get_shape(); + if (shape.size() == 4) { + if (shape[1] <= 4) { h = (int)shape[2]; w = (int)shape[3]; channels = (int)shape[1]; } + else { h = (int)shape[1]; w = (int)shape[2]; channels = (int)shape[3]; } + } + } + if (w == 0 || h == 0) return IMP_ERROR_INVALID_ARGUMENT; + + std::string caps_str; + GstBuffer* buffer = imp_image_prepare_encode_buffer(t, src_fmt, w, h, channels, caps_str); + if (!buffer) return IMP_ERROR_OUT_OF_MEMORY; + + GstElement* pipeline = gst_pipeline_new("img-enc"); + GstElement* appsrc = gst_element_factory_make("appsrc", "src"); + GstElement* convert = gst_element_factory_make("videoconvert", "conv"); + GstElement* encoder = gst_element_factory_make(encoder_elem.c_str(), "enc"); + GstElement* appsink = gst_element_factory_make("appsink", "sink"); + + if (!pipeline || !appsrc || !convert || !encoder || !appsink) { + gst_buffer_unref(buffer); + if (pipeline) gst_object_unref(pipeline); + return IMP_ERROR_ENCODE_FAILED; + } + + GstCaps* src_caps = gst_caps_from_string(caps_str.c_str()); + g_object_set(appsrc, "caps", src_caps, "format", GST_FORMAT_TIME, + "stream-type", 0, "is-live", FALSE, nullptr); + gst_caps_unref(src_caps); + + if (encoder_elem == "jpegenc") + g_object_set(encoder, "quality", quality, nullptr); + g_object_set(appsink, "sync", FALSE, "emit-signals", FALSE, nullptr); + + gst_bin_add_many(GST_BIN(pipeline), appsrc, convert, encoder, appsink, nullptr); + gst_element_link_many(appsrc, convert, encoder, appsink, nullptr); + + GST_BUFFER_PTS(buffer) = 0; + GST_BUFFER_DURATION(buffer) = GST_CLOCK_TIME_NONE; + + gst_element_set_state(pipeline, GST_STATE_PLAYING); + gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer); + 
gst_app_src_end_of_stream(GST_APP_SRC(appsrc)); + + GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND); + imp_status_t status; + if (!sample) { + status = IMP_ERROR_ENCODE_FAILED; + } else { + GstBuffer* enc_buf = gst_sample_get_buffer(sample); + GstMapInfo map; + if (gst_buffer_map(enc_buf, &map, GST_MAP_READ)) { + void* out = malloc(map.size); + if (out) { + memcpy(out, map.data, map.size); + *d = out; + *s = map.size; + status = IMP_OK; + } else { + status = IMP_ERROR_OUT_OF_MEMORY; + } + gst_buffer_unmap(enc_buf, &map); + } else { + status = IMP_ERROR_ENCODE_FAILED; + } + gst_sample_unref(sample); + } + + gst_element_set_state(pipeline, GST_STATE_NULL); + gst_object_unref(pipeline); + return status; } imp_status_t imp_encode_image_file(const char* f, imp_tensor_t* t, imp_context_t* c, const imp_image_encode_opts_t* o, imp_encode_callback_t cb, void* ud) { - (void)f;(void)t;(void)c;(void)o;(void)cb;(void)ud; - return IMP_ERROR_INTERNAL; + (void)cb; (void)ud; + + if (!f || !t || !t->valid) return IMP_ERROR_INVALID_ARGUMENT; + + gst_init(nullptr, nullptr); + + std::string format_str; + if (o && o->format) { + format_str = o->format; + } else { + std::string path(f); + auto dot = path.rfind('.'); + if (dot != std::string::npos) { + std::string ext = path.substr(dot + 1); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext == "jpg" || ext == "jpeg") format_str = "jpeg"; + else if (ext == "png") format_str = "png"; + } + } + if (format_str.empty()) format_str = "jpeg"; + + std::string encoder_elem; + if (format_str == "jpeg" || format_str == "jpg") encoder_elem = "jpegenc"; + else if (format_str == "png") encoder_elem = "pngenc"; + else return IMP_ERROR_UNSUPPORTED_FORMAT; + + int quality = (o && o->quality > 0) ? 
o->quality : 90; + + int w = t->width, h = t->height; + imp_pixel_format_t src_fmt = t->format; + int channels = imp_format_channels(src_fmt); + if (channels == 0 && src_fmt != IMP_FORMAT_NV12) channels = 3; + + if ((w == 0 || h == 0) && t->ov_tensor) { + auto shape = t->ov_tensor.get_shape(); + if (shape.size() == 4) { + if (shape[1] <= 4) { h = (int)shape[2]; w = (int)shape[3]; channels = (int)shape[1]; } + else { h = (int)shape[1]; w = (int)shape[2]; channels = (int)shape[3]; } + } + } + if (w == 0 || h == 0) return IMP_ERROR_INVALID_ARGUMENT; + + std::string caps_str; + GstBuffer* buffer = imp_image_prepare_encode_buffer(t, src_fmt, w, h, channels, caps_str); + if (!buffer) return IMP_ERROR_OUT_OF_MEMORY; + + std::string out_path(f); + for (auto& ch : out_path) if (ch == '\\') ch = '/'; + + GstElement* pipeline = gst_pipeline_new("img-enc-file"); + GstElement* appsrc = gst_element_factory_make("appsrc", "src"); + GstElement* convert = gst_element_factory_make("videoconvert", "conv"); + GstElement* encoder = gst_element_factory_make(encoder_elem.c_str(), "enc"); + GstElement* filesink = gst_element_factory_make("filesink", "sink"); + + if (!pipeline || !appsrc || !convert || !encoder || !filesink) { + gst_buffer_unref(buffer); + if (pipeline) gst_object_unref(pipeline); + return IMP_ERROR_ENCODE_FAILED; + } + + GstCaps* src_caps = gst_caps_from_string(caps_str.c_str()); + g_object_set(appsrc, "caps", src_caps, "format", GST_FORMAT_TIME, + "stream-type", 0, "is-live", FALSE, nullptr); + gst_caps_unref(src_caps); + + if (encoder_elem == "jpegenc") + g_object_set(encoder, "quality", quality, nullptr); + g_object_set(filesink, "location", out_path.c_str(), nullptr); + + gst_bin_add_many(GST_BIN(pipeline), appsrc, convert, encoder, filesink, nullptr); + gst_element_link_many(appsrc, convert, encoder, filesink, nullptr); + + GST_BUFFER_PTS(buffer) = 0; + GST_BUFFER_DURATION(buffer) = GST_CLOCK_TIME_NONE; + + gst_element_set_state(pipeline, GST_STATE_PLAYING); + 
gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer); + gst_app_src_end_of_stream(GST_APP_SRC(appsrc)); + + GstBus* bus = gst_element_get_bus(pipeline); + imp_status_t status = IMP_ERROR_ENCODE_FAILED; + if (bus) { + GstMessage* msg = gst_bus_timed_pop_filtered(bus, 10 * GST_SECOND, + (GstMessageType)(GST_MESSAGE_EOS | GST_MESSAGE_ERROR)); + if (msg) { + if (GST_MESSAGE_TYPE(msg) == GST_MESSAGE_EOS) + status = IMP_OK; + gst_message_unref(msg); + } + gst_object_unref(bus); + } + + gst_element_set_state(pipeline, GST_STATE_NULL); + gst_object_unref(pipeline); + return status; } ////////////////////////////////////////////////////////////////////////////// @@ -1336,7 +1947,6 @@ void imp_audio_encoder_close(imp_audio_encoder_t* encoder) { // One-shot audio encode (to memory) // --------------------------------------------------------------------------- -// Internal: write a WAV file into a malloc'd buffer (no GStreamer) static imp_status_t imp_encode_audio_wav(void** data, size_t* data_size, const float* samples, size_t num_samples, uint32_t sampleRate, uint32_t channels) { @@ -1381,13 +1991,10 @@ imp_status_t imp_encode_audio(void** data, uint32_t sampleRate = opts->sample_rate > 0 ? opts->sample_rate : 16000; uint32_t channels = opts->channels > 0 ? 
opts->channels : 1; - // ----- WAV: built-in writer (fast, no GStreamer) ----- if (codec == "wav") { - return imp_encode_audio_wav(data, data_size, samples, num_samples, - sampleRate, channels); + return imp_encode_audio_wav(data, data_size, samples, num_samples, sampleRate, channels); } - // ----- PCM: raw float bytes ----- if (codec == "pcm") { size_t sz = num_samples * sizeof(float); *data = malloc(sz); @@ -1397,7 +2004,7 @@ imp_status_t imp_encode_audio(void** data, return IMP_OK; } - // ----- Lossy / lossless codecs via GStreamer ----- + // Lossy / lossless codecs via GStreamer gst_init(nullptr, nullptr); std::string encElement; @@ -1409,15 +2016,13 @@ imp_status_t imp_encode_audio(void** data, } else if (codec == "flac") { encElement = "flacenc"; } else if (codec == "opus") { - // Opus standard sample rates: 8k, 12k, 16k, 24k, 48k uint32_t opusRate = (sampleRate <= 8000) ? 8000 : (sampleRate <= 12000) ? 12000 : (sampleRate <= 16000) ? 16000 : (sampleRate <= 24000) ? 24000 : 48000; encElement = "opusenc bitrate=" + std::to_string(bitrate * 1000); - // audioresample will convert to opusRate automatically via caps muxElement = "oggmux"; - sampleRate = opusRate; // override for caps + sampleRate = opusRate; } else if (codec == "aac") { encElement = "avenc_aac bitrate=" + std::to_string(bitrate * 1000); muxElement = "aacparse ! adtsmux"; @@ -1451,7 +2056,6 @@ imp_status_t imp_encode_audio(void** data, return IMP_ERROR_INTERNAL; } - // Configure appsrc for F32LE input GstCaps* caps = gst_caps_new_simple("audio/x-raw", "format", G_TYPE_STRING, "F32LE", "rate", G_TYPE_INT, (int)(opts->sample_rate > 0 ? 
opts->sample_rate : 16000), @@ -1463,29 +2067,23 @@ imp_status_t imp_encode_audio(void** data, gst_element_set_state(pipeline, GST_STATE_PLAYING); - // Push float samples size_t byteSize = num_samples * sizeof(float); GstBuffer* buffer = gst_buffer_new_allocate(nullptr, byteSize, nullptr); GstMapInfo map; gst_buffer_map(buffer, &map, GST_MAP_WRITE); memcpy(map.data, samples, byteSize); gst_buffer_unmap(buffer, &map); - GST_BUFFER_PTS(buffer) = 0; GST_BUFFER_DURATION(buffer) = gst_util_uint64_scale( num_samples / channels, GST_SECOND, opts->sample_rate > 0 ? opts->sample_rate : 16000); - - gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer); // takes ownership + gst_app_src_push_buffer(GST_APP_SRC(appsrc), buffer); gst_app_src_end_of_stream(GST_APP_SRC(appsrc)); - // Collect encoded output from appsink std::vector encoded; for (;;) { GstSample* sample = gst_app_sink_try_pull_sample(GST_APP_SINK(appsink), 10 * GST_SECOND); if (!sample) { - // Check if EOS was reached (normal completion) if (gst_app_sink_is_eos(GST_APP_SINK(appsink))) break; - // Timeout — abort break; } GstBuffer* outBuf = gst_sample_get_buffer(sample); @@ -1515,7 +2113,6 @@ imp_status_t imp_encode_audio_file(const char* file_path, const float* samples, size_t num_samples, const imp_audio_encode_opts_t* opts) { - // Encode to memory, then write to file void* data = nullptr; size_t data_size = 0; imp_status_t st = imp_encode_audio(&data, &data_size, samples, num_samples, opts); diff --git a/src/mpi/intel_mpi.h b/src/mpi/intel_mpi.h index 168d1562aa..3de44b21e3 100644 --- a/src/mpi/intel_mpi.h +++ b/src/mpi/intel_mpi.h @@ -323,6 +323,8 @@ typedef struct { int64_t timeout_ms; // Read timeout, -1 = infinite const imp_video_branch_t* branches; // Array of output branches (NULL = single branch at source res) // TODO FIXME do we need more than one? 
uint32_t branch_count; // Number of branches (0 = single branch at source res) + bool sync_appsink; // If true (file mode): force single-buffer queue+appsink (disable pre-buffering) + uint32_t queue_depth; // If nonzero (file mode): bound queue + appsink max-buffers to N (overrides sync_appsink) } imp_video_decode_opts_t; /** @@ -862,7 +864,7 @@ void imp_audio_encoder_close(imp_audio_encoder_t* encoder); * @param samples Input float PCM samples (mono, interleaved if stereo) * @param num_samples Number of float values in @p samples * @param opts Encode options (codec, sample_rate, channels, bitrate). - * opts->output_path is ignored — output goes to memory. + * opts->output_path is ignored - output goes to memory. * @return IMP_OK on success */ imp_status_t imp_encode_audio(void** data, From f14a95d3b112730106ad3968e164ce305875db8f Mon Sep 17 00:00:00 2001 From: Adrian Tobiszewski Date: Fri, 19 Jun 2026 17:15:26 +0200 Subject: [PATCH 3/3] WIP Linux/Win gstreamer build --- .gitignore | 1 + Dockerfile.ubuntu | 101 ++++ WORKSPACE | 16 + src/mpi/BUILD | 31 +- src/mpi/gst_loader.cpp | 492 ++++++++++++++++++ src/mpi/gst_loader.h | 105 ++++ src/mpi/imp_mpi_impl.h | 25 +- src/mpi/intel_mpi.cpp | 243 ++++++++- src/mpi/intel_mpi.h | 16 + src/test/mediapipe/calculators/BUILD | 30 ++ .../video_decode_infer_calculator.cc | 266 ++++++++++ .../calculators/video_decode_infer_test.cpp | 202 +++++++ third_party/BUILD | 9 + third_party/gstreamer/gstreamer_linux.BUILD | 53 ++ 14 files changed, 1562 insertions(+), 28 deletions(-) create mode 100644 src/mpi/gst_loader.cpp create mode 100644 src/mpi/gst_loader.h create mode 100644 src/test/mediapipe/calculators/video_decode_infer_calculator.cc create mode 100644 src/test/mediapipe/calculators/video_decode_infer_test.cpp create mode 100644 third_party/gstreamer/gstreamer_linux.BUILD diff --git a/.gitignore b/.gitignore index c404eb8cb8..5161fb94f0 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ cppclean_test tags .venv-style/ 
src/test/llm_testing +gstreamer_task/ node_modules/ yarn.* out diff --git a/Dockerfile.ubuntu b/Dockerfile.ubuntu index 7f480d15a4..8ff64ae74a 100644 --- a/Dockerfile.ubuntu +++ b/Dockerfile.ubuntu @@ -88,6 +88,102 @@ ARG OPENCV_VERSION=4.13.0 RUN OPENCV_VERSION=${OPENCV_VERSION} ./install_opencv.sh ####### End of OpenCV +################### GSTREAMER BUILD ########################## +# Build GStreamer from source so we control exact version and plugin set. +# Legal: gpl=disabled, ugly=disabled — no GPL/patent-risky plugins. +# Installed to /opt/gstreamer (self-contained, includes glib headers). +FROM base_build as gstreamer-builder +ARG GST_VERSION=1.26.2 +ARG JOBS=8 +ENV DEBIAN_FRONTEND=noninteractive +SHELL ["/bin/bash", "-xo", "pipefail", "-c"] + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3-pip \ + python3-venv \ + flex \ + bison \ + libglib2.0-dev \ + libmount-dev \ + libpcre2-dev \ + libffi-dev \ + libssl-dev \ + libdrm-dev \ + nasm \ + yasm \ + libavcodec-dev \ + libavformat-dev \ + libavutil-dev \ + libswscale-dev \ + git \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN python3 -m venv /build_venv && \ + /build_venv/bin/pip install --no-cache-dir meson==1.4.1 ninja==1.11.1.1 + +ENV PATH="/build_venv/bin:${PATH}" + +WORKDIR /src +RUN git clone --depth 1 --branch ${GST_VERSION} \ + https://gitlab.freedesktop.org/gstreamer/gstreamer.git + +WORKDIR /src/gstreamer + +# Install prefix: /opt/gstreamer +RUN meson setup \ + -Dexamples=disabled \ + -Dtests=disabled \ + -Dgst-examples=disabled \ + -Ddevtools=disabled \ + -Dorc=disabled \ + -Dgpl=disabled \ + -Dpython=disabled \ + -Dlibnice=disabled \ + -Dugly=disabled \ + -Dgst-plugins-bad:gpl=disabled \ + -Dgst-plugins-bad:va=disabled \ + -Dgst-plugins-bad:doc=disabled \ + -Dgst-plugins-bad:nls=disabled \ + -Dgst-plugins-bad:directfb=disabled \ + -Dgst-plugins-bad:openni2=disabled \ + -Dgst-plugins-bad:fdkaac=disabled \ + -Dgst-plugins-bad:ladspa=disabled \ + 
-Dgst-plugins-bad:assrender=disabled \ + -Dgst-plugins-bad:bs2b=disabled \ + -Dgst-plugins-bad:flite=disabled \ + -Dgst-plugins-bad:rtmp=disabled \ + -Dgst-plugins-bad:opencv=disabled \ + -Dgst-plugins-bad:sbc=disabled \ + -Dgst-plugins-bad:teletext=disabled \ + -Dgst-plugins-bad:x265=disabled \ + -Dgst-plugins-bad:webrtcdsp=disabled \ + -Dgst-plugins-bad:dash=disabled \ + -Dgst-plugins-bad:openjpeg=disabled \ + -Dgst-plugins-bad:soundtouch=disabled \ + -Dgst-plugins-bad:isac=disabled \ + -Dgst-plugins-base:nls=disabled \ + -Dgst-plugins-base:gl=disabled \ + -Dgst-plugins-base:pango=disabled \ + -Dgst-plugins-base:xvideo=disabled \ + -Dgst-plugins-good:nls=disabled \ + -Dgst-plugins-good:libcaca=disabled \ + -Dgst-plugins-good:lame=disabled \ + -Dgst-plugins-good:flac=disabled \ + -Dgst-plugins-good:dv=disabled \ + -Dgst-plugins-good:adaptivedemux2=disabled \ + --buildtype=release \ + --prefix=/opt/gstreamer \ + --libdir=lib \ + --libexecdir=bin \ + build/ + +RUN ninja -C build/ -j${JOBS} && meson install -C build/ + +# Copy glib/glib-2.0 headers into the prefix so it is self-contained. +# glib is a build dependency; its headers live in the system include path. +RUN cp -r /usr/include/glib-2.0 /opt/gstreamer/include/glib-2.0 && \ + cp -r /usr/lib/x86_64-linux-gnu/glib-2.0/include/. /opt/gstreamer/include/glib-2.0/ + ################### BASE BUILD ########################## FROM base_build as build ARG BASE_IMAGE @@ -125,6 +221,11 @@ RUN apt-get update && apt-get install --no-install-recommends -y \ vim && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* + +# GStreamer built from source — bring in the self-contained prefix. +# Runtime dlopen picks up /opt/gstreamer/lib at runtime via LD_LIBRARY_PATH or ldconfig. 
+COPY --from=gstreamer-builder /opt/gstreamer /opt/gstreamer +RUN echo /opt/gstreamer/lib > /etc/ld.so.conf.d/gstreamer.conf && ldconfig # on ubuntu 24.04 python3.12 is used as default python for ovms build and release # TF build needs python3.10 with numpy as it does not support python3.12 RUN python3.10 -m pip install "numpy<2.0.0" --no-cache-dir diff --git a/WORKSPACE b/WORKSPACE index 49413cd595..c3966202c3 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -122,6 +122,22 @@ new_local_repository( build_file = "@//third_party/boringssl:BUILD", ) +# GStreamer — built from source in gstreamer-builder Docker stage. +# Installed to /opt/gstreamer (self-contained: GStreamer + glib headers). +# Linux: dlopen at runtime (no DT_NEEDED), headers-only for compile. +# Windows: pre-built installer from gstreamer.freedesktop.org/data/pkg/windows/. +new_local_repository( + name = "linux_gstreamer", + path = "/opt/gstreamer", + build_file = "@//third_party/gstreamer:gstreamer_linux.BUILD", +) + +new_local_repository( + name = "windows_gstreamer", + path = "C:\\gstreamer\\1.0\\msvc_x86_64", + build_file = "@//third_party/gstreamer:gstreamer_windows.BUILD", +) + new_local_repository( name = "linux_curl", path = "/usr/", diff --git a/src/mpi/BUILD b/src/mpi/BUILD index 3239c5f622..b7ab637873 100644 --- a/src/mpi/BUILD +++ b/src/mpi/BUILD @@ -19,24 +19,26 @@ load("//:common_settings.bzl", "COMMON_LOCAL_DEFINES", "ovms_cc_library") package(default_visibility = ["//visibility:public"]) # Intel Media Processing Interface (Intel MPI) library -# Windows-only for now — wraps GStreamer for HW-accelerated media decode/encode -# with zero-copy OpenVINO tensor integration. -# -# On Linux this target is a no-op (empty lib) so dependents can -# unconditionally list it without breaking the build. +# Windows: direct GStreamer linking with D3D11 HW pipeline. +# Linux: GStreamer loaded at runtime via dlopen (gst_loader.cpp). +# CPU decode only; VA-API GPU path deferred. 
cc_library( name = "intel_mpi", srcs = select({ "//src:windows": ["intel_mpi.cpp"], - "//conditions:default": [], + "//conditions:default": ["intel_mpi.cpp", "gst_loader.cpp"], }), hdrs = select({ "//src:windows": [ "intel_mpi.h", "imp_mpi_impl.h", ], - "//conditions:default": ["intel_mpi.h"], + "//conditions:default": [ + "intel_mpi.h", + "imp_mpi_impl.h", + "gst_loader.h", + ], }), copts = select({ "//src:windows": [ @@ -52,18 +54,27 @@ cc_library( "/wd4244", "/wd4996", ], - "//conditions:default": [], + "//conditions:default": [ + "-std=c++17", + ], }), local_defines = COMMON_LOCAL_DEFINES + select({ "//src:windows": ["INTEL_MPI_AVAILABLE=1"], - "//conditions:default": [], + "//conditions:default": ["INTEL_MPI_AVAILABLE=1"], + }), + linkopts = select({ + "//src:windows": [], + "//conditions:default": ["-ldl"], }), deps = select({ "//src:windows": [ "//third_party:openvino", "//third_party:gstreamer", ], - "//conditions:default": [], + "//conditions:default": [ + "//third_party:openvino", + "//third_party:gstreamer", + ], }), visibility = ["//visibility:public"], ) diff --git a/src/mpi/gst_loader.cpp b/src/mpi/gst_loader.cpp new file mode 100644 index 0000000000..a6ad852c96 --- /dev/null +++ b/src/mpi/gst_loader.cpp @@ -0,0 +1,492 @@ +// Copyright (c) 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * GStreamer runtime loader — Linux implementation. 
+ *
+ * This is the ONLY translation unit that may include GStreamer headers.
+ * All gst_* calls go through function pointers resolved via dlopen so
+ * that the binary has no DT_NEEDED entry for GStreamer and can start on
+ * systems where GStreamer is not installed.
+ *
+ * TODO (Phase 2): verify graceful degradation in release image without
+ * GStreamer runtime packages.
+ */
+
+#ifndef _WIN32
+
+// ---- GStreamer headers (compile-time type definitions and macros) ----------
+// These are only needed here because we use GstVideoFrame / GstVideoInfo by
+// value (for GST_VIDEO_FRAME_PLANE_DATA / _STRIDE macros which are struct
+// field accesses, not function calls).
+// NOTE(review): the header names in this file were lost in extraction; the
+// ones below are reconstructed from the types and functions this unit uses.
+// Confirm against the original commit.
+#include <gst/gst.h>
+#include <gst/app/gstappsink.h>
+#include <gst/video/video.h>
+
+// ---- standard headers -------------------------------------------------------
+#include <dlfcn.h>
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// ---- Intel MPI public API ---------------------------------------------------
+#include "intel_mpi.h"
+
+// ---- Internal structs (needed for branch / stream layout) ------------------
+// We include only what's needed for the Linux video path; Windows-only
+// sections (D3D11, etc.) are guarded in imp_mpi_impl.h.
+#include "imp_mpi_impl.h"
+
+namespace chrono = std::chrono;
+
+// ============================================================================
+// Function pointer table
+// ============================================================================
+
+// One pointer per GStreamer / GLib entry point this file calls. Each member
+// carries the exact name of the C function it stands for: LOAD_SYM() passes
+// the member name, stringified, to dlsym(), so renaming a member changes the
+// symbol that is looked up. Members are grouped by the library handle that
+// load_symbols() resolves them from.
+struct GstFunctionTable {
+    // gstreamer-1.0
+    void (*gst_init)(int*, char***);
+    GstElement* (*gst_parse_launch)(const gchar*, GError**);
+    GstStateChangeReturn (*gst_element_set_state)(GstElement*, GstState);
+    GstStateChangeReturn (*gst_element_get_state)(GstElement*, GstState*,
+                                                  GstState*, GstClockTime);
+    GstElement* (*gst_bin_get_by_name)(GstBin*, const gchar*);
+    GstPad* (*gst_element_get_static_pad)(GstElement*, const gchar*);
+    GstCaps* (*gst_pad_get_current_caps)(GstPad*);
+    GstStructure* (*gst_caps_get_structure)(const GstCaps*, guint);
+    gboolean (*gst_structure_get_int)(const GstStructure*, const gchar*, gint*);
+    void (*gst_caps_unref)(GstCaps*);
+    void (*gst_object_unref)(gpointer);
+    GstBus* (*gst_element_get_bus)(GstElement*);
+    gboolean (*gst_element_send_event)(GstElement*, GstEvent*);
+    GstEvent* (*gst_event_new_eos)(void);
+    GstMessage* (*gst_bus_timed_pop_filtered)(GstBus*, GstClockTime,
+                                              GstMessageType);
+    void (*gst_message_unref)(GstMessage*);
+
+    // gstapp-1.0
+    GstSample* (*gst_app_sink_try_pull_sample)(GstAppSink*, GstClockTime);
+    GstBuffer* (*gst_sample_get_buffer)(GstSample*);
+    GstCaps* (*gst_sample_get_caps)(GstSample*);
+    void (*gst_sample_unref)(GstSample*);
+
+    // gstvideo-1.0
+    gboolean (*gst_video_info_from_caps)(GstVideoInfo*, const GstCaps*);
+    gboolean (*gst_video_frame_map)(GstVideoFrame*, GstVideoInfo*,
+                                    GstBuffer*, GstMapFlags);
+    void (*gst_video_frame_unmap)(GstVideoFrame*);
+
+    // glib-2.0
+    void (*g_error_free)(GError*);
+};
+
+// The table itself; all pointers start out null and are filled in by
+// load_symbols().
+static GstFunctionTable s_fns = {};
+// True only after gst_loader_init() has opened every library and resolved
+// every symbol.
+static bool s_available = false;
+// Set on the first gst_loader_init() call so that later calls return the
+// cached result instead of loading again.
+static bool s_initialized = false;
+
+// ============================================================================
+// Symbol loading helpers
+// 
============================================================================
+
+// Resolve the member s_fns.<sym> from library @p handle by looking up the
+// symbol of the same name. Expands to a `return false` from the enclosing
+// function when the symbol is missing, so it may only be used inside a
+// function returning bool.
+#define LOAD_SYM(handle, sym)                                          \
+    do {                                                               \
+        s_fns.sym = reinterpret_cast<decltype(s_fns.sym)>(             \
+            dlsym((handle), #sym));                                    \
+        if (!s_fns.sym) {                                              \
+            std::cerr << "[GstLoader] Missing symbol: " #sym "\n";     \
+            return false;                                              \
+        }                                                              \
+    } while (0)
+
+// Fill every member of s_fns from the four library handles. Returns false
+// (leaving the table partially filled) at the first symbol that cannot be
+// resolved.
+static bool load_symbols(void* gst_h, void* app_h,
+                         void* vid_h, void* glib_h) {
+    LOAD_SYM(gst_h, gst_init);
+    LOAD_SYM(gst_h, gst_parse_launch);
+    LOAD_SYM(gst_h, gst_element_set_state);
+    LOAD_SYM(gst_h, gst_element_get_state);
+    LOAD_SYM(gst_h, gst_bin_get_by_name);
+    LOAD_SYM(gst_h, gst_element_get_static_pad);
+    LOAD_SYM(gst_h, gst_pad_get_current_caps);
+    LOAD_SYM(gst_h, gst_caps_get_structure);
+    LOAD_SYM(gst_h, gst_structure_get_int);
+    LOAD_SYM(gst_h, gst_caps_unref);
+    LOAD_SYM(gst_h, gst_object_unref);
+    LOAD_SYM(gst_h, gst_element_get_bus);
+    LOAD_SYM(gst_h, gst_element_send_event);
+    LOAD_SYM(gst_h, gst_event_new_eos);
+    LOAD_SYM(gst_h, gst_bus_timed_pop_filtered);
+    LOAD_SYM(gst_h, gst_message_unref);
+
+    LOAD_SYM(app_h, gst_app_sink_try_pull_sample);
+    LOAD_SYM(app_h, gst_sample_get_buffer);
+    LOAD_SYM(app_h, gst_sample_get_caps);
+    LOAD_SYM(app_h, gst_sample_unref);
+
+    LOAD_SYM(vid_h, gst_video_info_from_caps);
+    LOAD_SYM(vid_h, gst_video_frame_map);
+    LOAD_SYM(vid_h, gst_video_frame_unmap);
+
+    LOAD_SYM(glib_h, g_error_free);
+
+    return true;
+}
+
+#undef LOAD_SYM
+
+// ============================================================================
+// Public loader API
+// ============================================================================
+
+// dlopen() one library. On failure the reason is appended to @p errors right
+// away: a later successful dl* call clears the pending dlerror() text, so it
+// cannot be read once at the end.
+static void* gst_loader_open_library(const char* soname, std::string& errors) {
+    void* handle = dlopen(soname, RTLD_LAZY | RTLD_GLOBAL);
+    if (!handle) {
+        const char* why = dlerror();
+        if (!errors.empty()) errors += "; ";
+        errors += why ? why : soname;
+    }
+    return handle;
+}
+
+// Load the GStreamer libraries and resolve every entry of s_fns. The work is
+// done on the first call only; later calls return the cached outcome.
+// Returns true when GStreamer can be used. On failure nothing stays loaded
+// and the function table is reset to all-null.
+bool gst_loader_init() {
+    if (s_initialized) return s_available;
+    s_initialized = true;
+
+    std::string errors;
+    void* gst_h  = gst_loader_open_library("libgstreamer-1.0.so.0", errors);
+    void* app_h  = gst_loader_open_library("libgstapp-1.0.so.0", errors);
+    void* vid_h  = gst_loader_open_library("libgstvideo-1.0.so.0", errors);
+    void* glib_h = gst_loader_open_library("libglib-2.0.so.0", errors);
+
+    const bool all_open = gst_h && app_h && vid_h && glib_h;
+    if (!all_open) {
+        std::cerr << "[GstLoader] GStreamer not available: " << errors << "\n"
+                  << "[GstLoader] Install gstreamer1.0-* packages to enable "
+                     "media decode.\n";
+    }
+
+    if (!all_open || !load_symbols(gst_h, app_h, vid_h, glib_h)) {
+        // Drop half-resolved pointers and release whatever was opened.
+        s_fns = GstFunctionTable{};
+        if (glib_h) dlclose(glib_h);
+        if (vid_h) dlclose(vid_h);
+        if (app_h) dlclose(app_h);
+        if (gst_h) dlclose(gst_h);
+        return false;
+    }
+
+    s_available = true;
+    std::cerr << "[GstLoader] GStreamer loaded successfully.\n";
+    return true;
+}
+
+// True once gst_loader_init() has succeeded.
+bool gst_loader_available() { return s_available; }
+
+// ============================================================================
+// Internal: read one NV12 frame from an appsink
+// ============================================================================
+
+// Copy @p rows rows of @p row_bytes bytes from a source plane with stride
+// @p src_stride into a tightly packed destination.
+static void copy_plane_packed(uint8_t* dst, const uint8_t* src,
+                              int src_stride, int row_bytes, int rows) {
+    if (src_stride == row_bytes) {
+        memcpy(dst, src, (size_t)row_bytes * (size_t)rows);
+        return;
+    }
+    for (int row = 0; row < rows; row++)
+        memcpy(dst + (size_t)row * row_bytes,
+               src + (size_t)row * src_stride, row_bytes);
+}
+
+// Pull one sample from @p appsink (waiting up to one second) and copy its Y
+// and interleaved UV planes into @p frame, which is (re)allocated to the
+// sample's dimensions. The three timing accumulators are increased by the
+// milliseconds spent pulling, copying and overall.
+// Returns true when a frame was copied; on any failure frame->valid is false.
+static bool read_nv12_from_appsink(GstElement* appsink,
+                                   imp_nv12_frame_t* frame,
+                                   double& total_ms,
+                                   double& pull_ms,
+                                   double& copy_ms) {
+    if (!appsink || !frame) return false;
+
+    const auto& f = s_fns;
+    auto decode_start = chrono::high_resolution_clock::now();
+
+    auto ps = chrono::high_resolution_clock::now();
+    // Plain cast on purpose: GST_APP_SINK() expands to a checked cast that
+    // references gst_app_sink_get_type() directly, which would put back the
+    // link-time dependency on GStreamer that this loader exists to avoid.
+    GstSample* sample = f.gst_app_sink_try_pull_sample(
+        reinterpret_cast<GstAppSink*>(appsink), GST_SECOND);
+    if (!sample) return false;
+    pull_ms += chrono::duration<double, std::milli>(
+        chrono::high_resolution_clock::now() - ps).count();
+
+    GstBuffer* buffer = f.gst_sample_get_buffer(sample);
+    GstCaps* caps = f.gst_sample_get_caps(sample);
+
+    GstVideoInfo info;
+    if (!buffer || !caps || !f.gst_video_info_from_caps(&info, caps)) {
+        // Without valid caps the frame size is unknown; do not allocate
+        // from an uninitialised GstVideoInfo.
+        frame->valid = false;
+        f.gst_sample_unref(sample);
+        return false;
+    }
+
+    frame->allocate(info.width, info.height);
+    frame->valid = false;
+
+    auto cs = chrono::high_resolution_clock::now();
+
+    GstVideoFrame vframe;
+    if (f.gst_video_frame_map(&vframe, &info, buffer, GST_MAP_READ)) {
+        copy_plane_packed(frame->y_plane.data(),
+                          static_cast<uint8_t*>(GST_VIDEO_FRAME_PLANE_DATA(&vframe, 0)),
+                          GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 0),
+                          frame->width, frame->height);
+
+        copy_plane_packed(frame->uv_plane.data(),
+                          static_cast<uint8_t*>(GST_VIDEO_FRAME_PLANE_DATA(&vframe, 1)),
+                          GST_VIDEO_FRAME_PLANE_STRIDE(&vframe, 1),
+                          frame->width, frame->height / 2);
+
+        frame->valid = true;
+        f.gst_video_frame_unmap(&vframe);
+    }
+
+    f.gst_sample_unref(sample);
+
+    copy_ms += chrono::duration<double, std::milli>(
+        chrono::high_resolution_clock::now() - cs).count();
+    total_ms += chrono::duration<double, std::milli>(
+        chrono::high_resolution_clock::now() - decode_start).count();
+
+    return frame->valid;
+}
+
+// ============================================================================
+// Public: linux_video_open
+// ============================================================================
+
+imp_status_t linux_video_open(imp_video_stream_t** stream,
+                              imp_video_source_t* source,
+                              imp_context_t* ctx,
+                              const imp_video_decode_opts_t* opts) {
+    if (!stream || !source || !ctx) return IMP_ERROR_INVALID_ARGUMENT;
+
+    if (!gst_loader_available()) {
+        ctx->last_error = "GStreamer not available; install gstreamer1.0-* "
+                          "packages and retry.";
+        return IMP_ERROR_DEVICE_NOT_AVAILABLE;
+    }
+
+    const auto& f = s_fns;
+    auto start = chrono::high_resolution_clock::now();
+
+    f.gst_init(nullptr, nullptr);
+
+    auto* s = new imp_video_stream_s();
+    s->timing = &ctx->timing;
+
+    int src_w = (source->width > 0) ? source->width : 1280;
+    int src_h = (source->height > 0) ? 
source->height : 720; + + bool is_file = (source->type == IMP_SOURCE_FILE); + + // Build branch list (single branch only for initial Linux impl) + struct resolved_branch { int width; int height; std::string name; }; + std::vector resolved; + + if (opts && opts->branches && opts->branch_count > 0) { + for (uint32_t i = 0; i < opts->branch_count; i++) { + resolved_branch rb; + int bw = static_cast(opts->branches[i].width); + int bh = static_cast(opts->branches[i].height); + if (bw > 0 && bh > 0) { + rb.width = bw; + rb.height = bh; + } else if (ctx->model_w > 0 && ctx->model_h > 0) { + rb.width = ctx->model_w; + rb.height = ctx->model_h; + } else { + rb.width = src_w; + rb.height = src_h; + } + rb.name = opts->branches[i].name + ? opts->branches[i].name + : ("branch_" + std::to_string(i)); + resolved.push_back(rb); + } + } else { + resolved.push_back({src_w, src_h, "default"}); + } + + // appsink properties + std::string sink_props = is_file + ? "emit-signals=false sync=false max-buffers=0 drop=false" + : "emit-signals=false sync=false max-buffers=2 drop=true"; + + if (is_file && opts && opts->sync_appsink) { + sink_props = "emit-signals=false sync=false max-buffers=1 drop=false"; + } + + // Linux CPU decode pipeline: + // filesrc | decodebin | videoconvert | videoscale | caps | appsink + // + // TODO: VA-API GPU path (deferred — hardware not available yet): + // filesrc | decodebin | vapostproc | caps | appsink + std::string pipeline_str; + + if (!is_file) { + // Camera / live source not yet implemented on Linux + ctx->last_error = "Live camera source not yet supported on Linux"; + delete s; + return IMP_ERROR_UNSUPPORTED_FORMAT; + } + + // Normalize path (backslashes break gst_parse_launch) + std::string path = source->path; + std::replace(path.begin(), path.end(), '\\', '/'); + + if (resolved.size() == 1) { + const auto& rb = resolved[0]; + pipeline_str = + "filesrc location=\"" + path + "\" ! " + "decodebin ! " + "videoconvert ! " + "videoscale ! 
" + "video/x-raw,format=NV12," + "width=" + std::to_string(rb.width) + + ",height=" + std::to_string(rb.height) + " ! " + "appsink name=branch_0 " + sink_props; + } else { + // Multi-branch via tee + pipeline_str = + "filesrc location=\"" + path + "\" ! " + "decodebin ! " + "videoconvert ! tee name=t"; + + for (size_t i = 0; i < resolved.size(); i++) { + const auto& rb = resolved[i]; + std::string sink_name = "branch_" + std::to_string(i); + pipeline_str += + " t. ! queue ! " + "videoscale ! " + "video/x-raw,format=NV12," + "width=" + std::to_string(rb.width) + + ",height=" + std::to_string(rb.height) + " ! " + "appsink name=" + sink_name + " " + sink_props; + } + } + + std::cout << "[linux_video_open] Pipeline: " << pipeline_str << "\n"; + + GError* error = nullptr; + s->pipeline = f.gst_parse_launch(pipeline_str.c_str(), &error); + if (error || !s->pipeline) { + ctx->last_error = error ? error->message : "pipeline creation failed"; + if (error) f.g_error_free(error); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + + s->branches.resize(resolved.size()); + for (size_t i = 0; i < resolved.size(); i++) { + std::string sink_name = "branch_" + std::to_string(i); + s->branches[i].appsink = f.gst_bin_get_by_name( + GST_BIN(s->pipeline), sink_name.c_str()); + if (!s->branches[i].appsink) { + ctx->last_error = "failed to get appsink: " + sink_name; + f.gst_object_unref(s->pipeline); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + s->branches[i].width = resolved[i].width; + s->branches[i].height = resolved[i].height; + s->branches[i].name = resolved[i].name; + s->branches[i].frame.allocate(resolved[i].width, resolved[i].height); + } + + f.gst_element_set_state(s->pipeline, GST_STATE_PLAYING); + GstStateChangeReturn ret = f.gst_element_get_state( + s->pipeline, nullptr, nullptr, 5 * GST_SECOND); + if (ret == GST_STATE_CHANGE_FAILURE) { + ctx->last_error = "pipeline failed to start"; + for (auto& b : s->branches) { + if (b.appsink) f.gst_object_unref(b.appsink); + } + 
f.gst_object_unref(s->pipeline); + delete s; + return IMP_ERROR_DECODE_FAILED; + } + + // Read negotiated caps from first appsink to get actual dimensions + GstPad* pad = f.gst_element_get_static_pad(s->branches[0].appsink, "sink"); + if (pad) { + GstCaps* caps = f.gst_pad_get_current_caps(pad); + if (caps) { + GstStructure* st = f.gst_caps_get_structure(caps, 0); + int actual_w = 0, actual_h = 0; + f.gst_structure_get_int(st, "width", &actual_w); + f.gst_structure_get_int(st, "height", &actual_h); + if (actual_w > 0) s->branches[0].width = actual_w; + if (actual_h > 0) s->branches[0].height = actual_h; + f.gst_caps_unref(caps); + } + f.gst_object_unref(pad); + } + + s->use_hw_decode = false; // CPU decode + + auto end = chrono::high_resolution_clock::now(); + ctx->timing.video_open_ms = + chrono::duration(end - start).count(); + + for (size_t i = 0; i < s->branches.size(); i++) { + const auto& b = s->branches[i]; + std::cout << "[linux_video_open] Branch " << i + << " [" << b.name << "]: " + << b.width << "x" << b.height << " NV12 (CPU)\n"; + } + + *stream = s; + return IMP_OK; +} + +// ============================================================================ +// Public: linux_video_read_frame +// ============================================================================ + +imp_status_t linux_video_read_frame(imp_tensor_t** tensor, + imp_video_stream_t* stream, + uint32_t branch_index) { + if (!stream || branch_index >= static_cast(stream->branches.size())) + return IMP_ERROR_INVALID_ARGUMENT; + + auto& branch = stream->branches[branch_index]; + + bool ok = read_nv12_from_appsink( + branch.appsink, &branch.frame, + branch.total_decode_ms, + branch.total_decode_pull_ms, + branch.total_decode_copy_ms); + + if (!ok) return IMP_ERROR_STREAM_END; + + branch.tensor_cache.y_data = branch.frame.y_plane.data(); + branch.tensor_cache.uv_data = branch.frame.uv_plane.data(); + branch.tensor_cache.width = branch.frame.width; + branch.tensor_cache.height = 
branch.frame.height; + branch.tensor_cache.format = IMP_FORMAT_NV12; + branch.tensor_cache.valid = true; + branch.tensor_cache.device_type = IMP_DEVICE_CPU; + + if (tensor) *tensor = &branch.tensor_cache; + return IMP_OK; +} + +// ============================================================================ +// Public: linux_video_close +// ============================================================================ + +void linux_video_close(imp_video_stream_t* stream) { + if (!stream) return; + const auto& f = s_fns; + if (stream->pipeline) { + f.gst_element_set_state(stream->pipeline, GST_STATE_NULL); + for (auto& b : stream->branches) { + if (b.appsink) f.gst_object_unref(b.appsink); + } + f.gst_object_unref(stream->pipeline); + } + delete stream; +} + +#endif // !_WIN32 diff --git a/src/mpi/gst_loader.h b/src/mpi/gst_loader.h new file mode 100644 index 0000000000..2963c19517 --- /dev/null +++ b/src/mpi/gst_loader.h @@ -0,0 +1,105 @@ +// Copyright (c) 2025 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * GStreamer runtime loader for Linux. + * + * Provides a dlopen-based loading of GStreamer shared libraries so that + * the Intel MPI binary can start even on systems without GStreamer installed. + * All GStreamer operations are only available when gst_loader_available() + * returns true. 
+ * + * Usage: + * gst_loader_init(); // call once; safe to call multiple times + * if (!gst_loader_available()) return IMP_ERROR_DEVICE_NOT_AVAILABLE; + * // use linux_video_open() / linux_video_read_frame() / linux_video_close() + * + * This header intentionally does NOT include any GStreamer headers so that + * files outside of gst_loader.cpp do not transitively link against GStreamer. + * All GStreamer types exposed in the public API are forward-declared below. + */ + +#ifndef GST_LOADER_H +#define GST_LOADER_H + +#ifndef _WIN32 + +#include +#include + +// Forward-declare GStreamer opaque types. Consumers only hold pointers so +// full definitions are not required here. +typedef struct _GstElement GstElement; + +// Intel MPI public API — provides all imp_* types and status codes. +#include "intel_mpi.h" + +/** + * Attempt to dlopen GStreamer shared libraries and resolve all required + * function pointers. Safe to call multiple times (idempotent after first + * call). + * + * @return true if all symbols were resolved and GStreamer is usable. + * @return false if any library or symbol was missing. + */ +bool gst_loader_init(); + +/** + * Returns true after a successful gst_loader_init(). + * Returns false if GStreamer is not installed or init has not been called. + */ +bool gst_loader_available(); + +// --------------------------------------------------------------------------- +// High-level Linux video operations. +// These are the only entry-points needed by intel_mpi.cpp on Linux. +// All GStreamer details are hidden inside gst_loader.cpp. +// --------------------------------------------------------------------------- + +/** + * Open a video file for decode. + * Equivalent to imp_video_open() on Linux. 
+ * + * @param stream output: newly allocated stream handle + * @param source video source configuration (path, dimensions, …) + * @param ctx Intel MPI context (for dimensions hints and error reporting) + * @param opts decode options (branches, sync mode, …) — may be nullptr + * @return IMP_OK on success, error code otherwise. + */ +imp_status_t linux_video_open(imp_video_stream_t** stream, + imp_video_source_t* source, + imp_context_t* ctx, + const imp_video_decode_opts_t* opts); + +/** + * Read one NV12 frame from the given branch of an open stream. + * Equivalent to imp_video_read_frame() on Linux. + * + * @param tensor output: pointer to internal tensor cache (not owned) + * @param stream open stream + * @param branch_index branch to read from (0 for single-branch streams) + * @return IMP_OK, IMP_ERROR_STREAM_END when exhausted, or error code. + */ +imp_status_t linux_video_read_frame(imp_tensor_t** tensor, + imp_video_stream_t* stream, + uint32_t branch_index); + +/** + * Close and free a stream opened with linux_video_open(). + * Equivalent to imp_video_close() on Linux. + */ +void linux_video_close(imp_video_stream_t* stream); + +#endif // !_WIN32 +#endif // GST_LOADER_H diff --git a/src/mpi/imp_mpi_impl.h b/src/mpi/imp_mpi_impl.h index ad90e7f74c..8c2deafcc7 100644 --- a/src/mpi/imp_mpi_impl.h +++ b/src/mpi/imp_mpi_impl.h @@ -4,15 +4,12 @@ * NOT part of the public API. These are the C++ structs behind the * opaque handles declared in intel_mpi.h. * - * Only intel_mpi.cpp should include this file. + * Only intel_mpi.cpp and gst_loader.cpp should include this file. 
*/ #ifndef IMP_MPI_IMPL_H #define IMP_MPI_IMPL_H -#define NOMINMAX -#define WIN32_LEAN_AND_MEAN - #include #include #include @@ -20,7 +17,10 @@ #include #include +#ifdef _WIN32 // Windows + D3D11 +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN #include #include #include @@ -29,7 +29,7 @@ #include #include -// GStreamer +// GStreamer (Windows: direct linking via import libs) #include #include #include @@ -37,6 +37,21 @@ #include #include +#else // Linux + +// OpenVINO C++ API +#include + +// GStreamer types needed for struct members (pointer-only use). +// Full headers are only included in gst_loader.cpp where the dlopen +// wrapper lives. Using forward declarations here keeps intel_mpi.cpp +// free of any direct GStreamer symbol references. +typedef struct _GstElement GstElement; +typedef struct _GstBus GstBus; +typedef struct _GstMessage GstMessage; + +#endif // _WIN32 + // Public API types #include "intel_mpi.h" diff --git a/src/mpi/intel_mpi.cpp b/src/mpi/intel_mpi.cpp index 317fd068f8..173c5aa7b0 100644 --- a/src/mpi/intel_mpi.cpp +++ b/src/mpi/intel_mpi.cpp @@ -11,16 +11,29 @@ * - Encoder = accepts imp_tensor_t* directly * * NO inference logic lives here. Inference helpers belong in the demo/OVMS layer. + * + * Platform notes: + * Windows — direct GStreamer linking via import libs; D3D11 HW pipeline. + * Linux — GStreamer loaded at runtime via dlopen (gst_loader.cpp); + * CPU decode pipeline only (VA-API GPU path deferred). + * Image / audio encode-decode functions are Windows-only for now + * and return IMP_ERROR_INTERNAL on Linux. 
*/ +#ifdef _WIN32 #define NOMINMAX #define WIN32_LEAN_AND_MEAN #ifndef GST_USE_UNSTABLE_API #define GST_USE_UNSTABLE_API #endif +#endif // _WIN32 #include "imp_mpi_impl.h" // same directory +#ifndef _WIN32 +#include "gst_loader.h" +#endif + #include #include #include @@ -45,10 +58,13 @@ imp_status_t imp_context_create(imp_context_t** ctx, ov_compiled_model_t* compiled_model) { if (!ctx) return IMP_ERROR_INVALID_ARGUMENT; - // In this POC, compiled_model is actually a C++ ov::CompiledModel* - // cast through the C opaque pointer. + // compiled_model is a C++ ov::CompiledModel* cast through the C opaque pointer. + // On Linux / CPU-only mode, nullptr is valid — creates a standalone context + // with no model dimension hints. auto* cm = reinterpret_cast(compiled_model); +#ifdef _WIN32 if (!cm) return IMP_ERROR_INVALID_ARGUMENT; +#endif auto start = chrono::high_resolution_clock::now(); @@ -56,17 +72,21 @@ imp_status_t imp_context_create(imp_context_t** ctx, // Extract model input dimensions for branch auto-deduction. // After NV12 PPP the Y input is [1, H, W, 1]. - auto inputs = cm->inputs(); - if (!inputs.empty()) { - auto shape = inputs[0].get_shape(); - if (shape.size() == 4) { - c->model_h = static_cast(shape[1]); - c->model_w = static_cast(shape[2]); + if (cm) { + // Extract model input dimensions for branch auto-deduction. + // After NV12 PPP the Y input is [1, H, W, 1]. + auto inputs = cm->inputs(); + if (!inputs.empty()) { + auto shape = inputs[0].get_shape(); + if (shape.size() == 4) { + c->model_h = static_cast(shape[1]); + c->model_w = static_cast(shape[2]); + } } } - c->device_type = IMP_DEVICE_GPU; - c->device_name = "GPU"; + c->device_type = IMP_DEVICE_CPU; + c->device_name = cm ? 
"GPU" : "CPU"; c->initialized = true; auto end = chrono::high_resolution_clock::now(); @@ -158,6 +178,12 @@ void imp_video_source_destroy(imp_video_source_t* source) { delete source; } +// ============================================================================ +// Windows-only GStreamer video / audio / image pipeline implementations. +// Linux equivalents live in gst_loader.cpp (dlopen-based). +// ============================================================================ +#ifdef _WIN32 + ////////////////////////////////////////////////////////////////////////////// // Internal: read NV12 from appsink into frame buffer ////////////////////////////////////////////////////////////////////////////// @@ -448,6 +474,18 @@ imp_status_t imp_video_open(imp_video_stream_t** stream, return IMP_OK; } +#else // Linux — delegate to gst_loader.cpp + +imp_status_t imp_video_open(imp_video_stream_t** stream, + imp_video_source_t* source, + imp_context_t* ctx, + const imp_video_decode_opts_t* opts) { + gst_loader_init(); + return linux_video_open(stream, source, ctx, opts); +} + +#endif // _WIN32 + ////////////////////////////////////////////////////////////////////////////// // Video Read Frame — branch-aware ////////////////////////////////////////////////////////////////////////////// @@ -455,8 +493,7 @@ imp_status_t imp_video_open(imp_video_stream_t** stream, imp_status_t imp_video_read_frame(imp_tensor_t** tensor, imp_video_stream_t* stream, uint32_t branch_index) { - if (!stream || branch_index >= (uint32_t)stream->branches.size()) - return IMP_ERROR_INVALID_ARGUMENT; +#ifdef _WIN32 auto& branch = stream->branches[branch_index]; @@ -479,6 +516,10 @@ imp_status_t imp_video_read_frame(imp_tensor_t** tensor, if (tensor) *tensor = &branch.tensor_cache; return IMP_OK; + +#else // Linux + return linux_video_read_frame(tensor, stream, branch_index); +#endif } imp_status_t imp_video_read_frame_by_name(imp_tensor_t** tensor, @@ -518,6 +559,7 @@ imp_status_t 
imp_video_get_info(imp_video_stream_t* stream, } void imp_video_close(imp_video_stream_t* stream) { +#ifdef _WIN32 if (!stream) return; if (stream->pipeline) { gst_element_set_state(stream->pipeline, GST_STATE_NULL); @@ -527,12 +569,17 @@ void imp_video_close(imp_video_stream_t* stream) { gst_object_unref(stream->pipeline); } delete stream; +#else + linux_video_close(stream); +#endif } ////////////////////////////////////////////////////////////////////////////// -// Video Encoder +// Video Encoder (Windows-only — Linux stub returns IMP_ERROR_INTERNAL) ////////////////////////////////////////////////////////////////////////////// +#ifdef _WIN32 + imp_status_t imp_video_encoder_create(imp_video_encoder_t** encoder, uint32_t width, uint32_t height, imp_context_t* ctx, @@ -662,6 +709,28 @@ void imp_video_encoder_close(imp_video_encoder_t* encoder) { delete encoder; } +#else // Linux stubs for video encoder + +imp_status_t imp_video_encoder_create(imp_video_encoder_t** encoder, + uint32_t width, uint32_t height, + imp_context_t* ctx, + const imp_video_encode_opts_t* opts) { + (void)encoder; (void)width; (void)height; (void)ctx; (void)opts; + return IMP_ERROR_INTERNAL; // Not yet implemented on Linux +} + +imp_status_t imp_video_encoder_write(imp_video_encoder_t* encoder, + imp_tensor_t* tensor) { + (void)encoder; (void)tensor; + return IMP_ERROR_INTERNAL; +} + +void imp_video_encoder_close(imp_video_encoder_t* encoder) { + (void)encoder; +} + +#endif // _WIN32 + ////////////////////////////////////////////////////////////////////////////// // Tensor utilities ////////////////////////////////////////////////////////////////////////////// @@ -728,6 +797,22 @@ imp_status_t imp_tensor_get_element_type(imp_tensor_t* tensor, return IMP_ERROR_INTERNAL; // TODO: map ov::element::Type } +imp_status_t imp_tensor_get_nv12_planes(imp_tensor_t* tensor, + const uint8_t** y_data, + const uint8_t** uv_data, + int* width, + int* height) { + if (!tensor || !y_data || !uv_data || !width 
|| !height) + return IMP_ERROR_INVALID_ARGUMENT; + if (tensor->format != IMP_FORMAT_NV12 || tensor->device_type != IMP_DEVICE_CPU) + return IMP_ERROR_INVALID_ARGUMENT; + *y_data = tensor->y_data; + *uv_data = tensor->uv_data; + *width = tensor->width; + *height = tensor->height; + return IMP_OK; +} + void imp_tensor_release(imp_tensor_t* tensor) { if (!tensor) return; delete tensor; @@ -753,6 +838,14 @@ imp_status_t imp_hw_encode_supported(imp_context_t* ctx, bool* supported) { return IMP_OK; } +// ============================================================================ +// Image decode / encode and audio decode / encode — Windows-only. +// These functions use direct GStreamer calls that are only available on +// Windows (D3D11 pipeline, mfh264enc, etc.). Linux stubs follow at the +// end of the #ifdef _WIN32 / #else block. +// ============================================================================ +#ifdef _WIN32 + ////////////////////////////////////////////////////////////////////////////// // Image decode / encode ////////////////////////////////////////////////////////////////////////////// @@ -2126,3 +2219,127 @@ imp_status_t imp_encode_audio_file(const char* file_path, return IMP_OK; } +#else // Linux stubs for image/audio decode-encode and audio stream/encoder + +imp_status_t imp_decode_image_file(imp_tensor_t** t, const char* f, + imp_context_t* c, + const imp_image_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t; (void)f; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; // Not yet implemented on Linux +} + +imp_status_t imp_decode_image(imp_tensor_t** t, const void* d, size_t s, + imp_context_t* c, + const imp_image_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t; (void)d; (void)s; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_decode_audio(imp_tensor_t** t, const void* d, size_t s, + imp_context_t* c, + const imp_audio_decode_opts_t* o, + 
imp_decode_callback_t cb, void* ud) { + (void)t; (void)d; (void)s; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_decode_audio_file(imp_tensor_t** t, const char* f, + imp_context_t* c, + const imp_audio_decode_opts_t* o, + imp_decode_callback_t cb, void* ud) { + (void)t; (void)f; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_file_info(const char* file_path, + uint32_t* sample_rate, + uint32_t* channels, + double* duration_sec) { + (void)file_path; (void)sample_rate; (void)channels; (void)duration_sec; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_open(imp_audio_stream_t** stream, + const char* input_path, + const char* output_path, + const imp_audio_stream_opts_t* opts) { + (void)stream; (void)input_path; (void)output_path; (void)opts; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_get_info(imp_audio_stream_t* stream, + imp_audio_info_t* info) { + (void)stream; (void)info; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_process(imp_audio_stream_t* stream) { + (void)stream; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_get_timing(imp_audio_stream_t* stream, + double* wall_time_sec, + double* realtime_factor) { + (void)stream; (void)wall_time_sec; (void)realtime_factor; + return IMP_ERROR_INTERNAL; +} + +void imp_audio_close(imp_audio_stream_t* stream) { + (void)stream; +} + +imp_status_t imp_encode_image(void** d, size_t* s, imp_tensor_t* t, + imp_context_t* c, + const imp_image_encode_opts_t* o, + imp_encode_callback_t cb, void* ud) { + (void)d; (void)s; (void)t; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_encode_image_file(const char* f, imp_tensor_t* t, + imp_context_t* c, + const imp_image_encode_opts_t* o, + imp_encode_callback_t cb, void* ud) { + (void)f; (void)t; (void)c; (void)o; (void)cb; (void)ud; + return IMP_ERROR_INTERNAL; +} + +imp_status_t 
imp_audio_encoder_create(imp_audio_encoder_t** encoder, + const imp_audio_encode_opts_t* opts, + imp_encode_callback_t callback, + void* user_data) { + (void)encoder; (void)opts; (void)callback; (void)user_data; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_audio_encoder_write(imp_audio_encoder_t* encoder, + imp_tensor_t* tensor) { + (void)encoder; (void)tensor; + return IMP_ERROR_INTERNAL; +} + +void imp_audio_encoder_close(imp_audio_encoder_t* encoder) { + (void)encoder; +} + +imp_status_t imp_encode_audio(void** data, + size_t* data_size, + const float* samples, + size_t num_samples, + const imp_audio_encode_opts_t* opts) { + (void)data; (void)data_size; (void)samples; (void)num_samples; (void)opts; + return IMP_ERROR_INTERNAL; +} + +imp_status_t imp_encode_audio_file(const char* file_path, + const float* samples, + size_t num_samples, + const imp_audio_encode_opts_t* opts) { + (void)file_path; (void)samples; (void)num_samples; (void)opts; + return IMP_ERROR_INTERNAL; +} + +#endif // _WIN32 diff --git a/src/mpi/intel_mpi.h b/src/mpi/intel_mpi.h index 3de44b21e3..4a4786f628 100644 --- a/src/mpi/intel_mpi.h +++ b/src/mpi/intel_mpi.h @@ -959,6 +959,22 @@ imp_status_t imp_tensor_get_shape(imp_tensor_t* tensor, imp_status_t imp_tensor_get_element_type(imp_tensor_t* tensor, imp_element_type_t* type); +/** + * Get NV12 plane pointers and dimensions for CPU-side NV12 tensors. 
+ * + * @param tensor Tensor handle (must be format IMP_FORMAT_NV12, device CPU) + * @param y_data Output: pointer to Y plane (width * height bytes) + * @param uv_data Output: pointer to interleaved UV plane (width * height/2 bytes) + * @param width Output: frame width in pixels + * @param height Output: frame height in pixels + * @return IMP_OK on success, IMP_ERROR_INVALID_ARGUMENT if tensor is not NV12/CPU + */ +imp_status_t imp_tensor_get_nv12_planes(imp_tensor_t* tensor, + const uint8_t** y_data, + const uint8_t** uv_data, + int* width, + int* height); + /** * Release tensor * diff --git a/src/test/mediapipe/calculators/BUILD b/src/test/mediapipe/calculators/BUILD index a62497082b..ea9c9ece4e 100644 --- a/src/test/mediapipe/calculators/BUILD +++ b/src/test/mediapipe/calculators/BUILD @@ -95,3 +95,33 @@ cc_library( linkopts = LINKOPTS_ADJUSTED, ) +# Standalone smoke test: decode video with GStreamer + infer with OV directly +# Run with: +# bazel run //src/test/mediapipe/calculators:video_decode_infer_test \ +# -- /path/to/video.avi /path/to/model.xml +cc_binary( + name = "video_decode_infer_test", + srcs = ["video_decode_infer_test.cpp"], + copts = COPTS_ADJUSTED, + deps = [ + "//src/mpi:intel_mpi", + "//third_party:openvino", + ], + linkopts = LINKOPTS_ADJUSTED, +) + +cc_library( + name = "video_decode_infer_calculator", + linkstatic = 1, + alwayslink = 1, + srcs = ["video_decode_infer_calculator.cc"], + copts = COPTS_ADJUSTED, + visibility = ["//visibility:public"], + deps = [ + "//src/mpi:intel_mpi", + "@mediapipe_calculators//:mediapipe_calculators", + "//third_party:openvino", + ], + linkopts = LINKOPTS_ADJUSTED, +) + diff --git a/src/test/mediapipe/calculators/video_decode_infer_calculator.cc b/src/test/mediapipe/calculators/video_decode_infer_calculator.cc new file mode 100644 index 0000000000..a87c7601e1 --- /dev/null +++ b/src/test/mediapipe/calculators/video_decode_infer_calculator.cc @@ -0,0 +1,266 @@ +// Copyright (c) 2026 Intel Corporation +// +// 
// ============================================================================
// NV12 -> BGR float32 conversion (plain C++, no OpenCV dependency)
//
// Output: contiguous BGR row-major buffer [H x W x 3] float32, values [0,1]
// ============================================================================
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
/**
 * Convert a tightly packed NV12 frame to interleaved BGR float32 while
 * resizing with nearest-neighbour sampling (BT.601 limited range).
 *
 * Both planes are read with a row stride of src_w bytes; the UV plane is
 * treated as src_h / 2 rows of interleaved U,V pairs.
 *
 * @param y_plane   luma plane, src_w * src_h bytes
 * @param uv_plane  interleaved chroma plane, src_w * (src_h / 2) bytes
 * @param src_w     source width in pixels
 * @param src_h     source height in pixels
 * @param dst_w     output width in pixels
 * @param dst_h     output height in pixels
 * @param out_bgr   resized to dst_h * dst_w * 3 and filled in B,G,R order;
 *                  cleared when a plane is null or a dimension is <= 0
 */
static void nv12_to_bgr_float(const uint8_t* y_plane, const uint8_t* uv_plane,
                              int src_w, int src_h,
                              int dst_w, int dst_h,
                              std::vector<float>& out_bgr) {
    if (!y_plane || !uv_plane ||
        src_w <= 0 || src_h <= 0 || dst_w <= 0 || dst_h <= 0) {
        out_bgr.clear();
        return;
    }

    // All buffer offsets are computed in size_t so large frames cannot
    // overflow int arithmetic.
    const size_t in_stride = static_cast<size_t>(src_w);
    const size_t out_w = static_cast<size_t>(dst_w);
    out_bgr.resize(static_cast<size_t>(dst_h) * out_w * 3);

    const float scale_x = static_cast<float>(src_w) / dst_w;
    const float scale_y = static_cast<float>(src_h) / dst_h;

    // Number of complete chroma rows / (U,V) pairs per row.  With an odd
    // width or height the last luma column/row has no chroma sample of its
    // own, so lookups are clamped to the last complete one.
    const int uv_rows = src_h / 2;
    const int uv_pairs = src_w / 2;
    const bool has_chroma = (uv_rows > 0) && (uv_pairs > 0);

    // Clamp to [0,255] and normalize to [0,1]
    auto clamp01 = [](float v) {
        return v < 0.0f ? 0.0f : (v > 255.0f ? 1.0f : v / 255.0f);
    };

    for (int dy = 0; dy < dst_h; dy++) {
        int sy = static_cast<int>(dy * scale_y);
        if (sy >= src_h) sy = src_h - 1;

        int uv_r = sy / 2;
        if (has_chroma && uv_r >= uv_rows) uv_r = uv_rows - 1;

        const uint8_t* y_row = y_plane + static_cast<size_t>(sy) * in_stride;
        const uint8_t* uv_row =
            has_chroma ? uv_plane + static_cast<size_t>(uv_r) * in_stride
                       : nullptr;
        float* out_row = out_bgr.data() + static_cast<size_t>(dy) * out_w * 3;

        for (int dx = 0; dx < dst_w; dx++) {
            int sx = static_cast<int>(dx * scale_x);
            if (sx >= src_w) sx = src_w - 1;

            // Y sample
            const float Y = static_cast<float>(y_row[sx]) - 16.0f;

            // UV sample (interleaved: U at even index, V at odd).  A frame
            // too small to carry chroma is treated as neutral (U = V = 128).
            float U = 0.0f;
            float V = 0.0f;
            if (has_chroma) {
                int pair = sx / 2;
                if (pair >= uv_pairs) pair = uv_pairs - 1;
                const size_t uv_col = static_cast<size_t>(pair) * 2;
                U = static_cast<float>(uv_row[uv_col]) - 128.0f;
                V = static_cast<float>(uv_row[uv_col + 1]) - 128.0f;
            }

            // BT.601 limited range
            const float r = 1.164f * Y + 1.596f * V;
            const float g = 1.164f * Y - 0.392f * U - 0.813f * V;
            const float b = 1.164f * Y + 2.017f * U;

            float* px = out_row + static_cast<size_t>(dx) * 3;
            px[0] = clamp01(b);
            px[1] = clamp01(g);
            px[2] = clamp01(r);
        }
    }
}
#pragma GCC diagnostic pop
Open video — source dimensions come from the file + // ---------------------------------------------------------------- + imp_video_source_t* src = nullptr; + imp_video_source_create(&src, IMP_SOURCE_FILE); + imp_video_source_set(src, "path", video_path_.c_str()); + + imp_video_stream_t* stream = nullptr; + st = imp_video_open(&stream, src, ctx, nullptr); + imp_video_source_destroy(src); + + if (st != IMP_OK) { + LOG(ERROR) << "[VideoDecodeInferCalculator] imp_video_open failed: " << st + << " ctx_error: " << imp_context_get_error(ctx); + imp_context_destroy(ctx); + return -1; + } + + uint32_t frame_w = 0, frame_h = 0; + imp_video_get_info(stream, &frame_w, &frame_h, nullptr, nullptr); + LOG(INFO) << "[VideoDecodeInferCalculator] Video: " << frame_w << "x" << frame_h; + + // ---------------------------------------------------------------- + // 3. Load face-detection-adas-0001 — expects [1,3,384,672] BGR FP32 + // ---------------------------------------------------------------- + ov::Core core; + ov::CompiledModel model; + try { + model = core.compile_model(model_path_, "CPU"); + } catch (const std::exception& e) { + LOG(ERROR) << "[VideoDecodeInferCalculator] model load failed: " << e.what(); + imp_video_close(stream); + imp_context_destroy(ctx); + return -1; + } + + ov::InferRequest req = model.create_infer_request(); + // Get actual model input shape + auto in_shape = model.input(0).get_shape(); // [1,3,H,W] + int model_h = static_cast(in_shape[2]); + int model_w = static_cast(in_shape[3]); + LOG(INFO) << "[VideoDecodeInferCalculator] Model input: " + << model_w << "x" << model_h; + + // ---------------------------------------------------------------- + // 4. 
Decode loop + // ---------------------------------------------------------------- + int frame_count = 0; + int total_detections = 0; + const float detection_threshold = 0.5f; + + std::vector bgr_buf; + + for (;;) { + imp_tensor_t* tensor = nullptr; + st = imp_video_read_frame(&tensor, stream, 0); + if (st == IMP_ERROR_STREAM_END) break; + if (st != IMP_OK || !tensor) break; + + frame_count++; + + // Get NV12 plane pointers via public API (tensor struct is opaque) + const uint8_t* y_ptr = nullptr; + const uint8_t* uv_ptr = nullptr; + int fw = 0, fh = 0; + if (imp_tensor_get_nv12_planes(tensor, &y_ptr, &uv_ptr, &fw, &fh) != IMP_OK) + continue; + + // NV12 → BGR float, resize to model input + nv12_to_bgr_float(y_ptr, uv_ptr, + fw, fh, + model_w, model_h, + bgr_buf); + + // NHWC → NCHW transpose + std::vector nchw(bgr_buf.size()); + for (int c = 0; c < 3; c++) + for (int y = 0; y < model_h; y++) + for (int x = 0; x < model_w; x++) + nchw[c * model_h * model_w + y * model_w + x] = + bgr_buf[(y * model_w + x) * 3 + c]; + + ov::Shape shape = {1, 3, + static_cast(model_h), + static_cast(model_w)}; + ov::Tensor input_tensor(ov::element::f32, shape, nchw.data()); + req.set_input_tensor(input_tensor); + req.infer(); + + // Output: [1,1,N,7] columns: [img_id, label, conf, x1,y1,x2,y2] + auto out = req.get_output_tensor(0); + const float* det = out.data(); + size_t num_dets = out.get_shape()[2]; + + int frame_dets = 0; + for (size_t i = 0; i < num_dets; i++) { + float conf = det[i * 7 + 2]; + if (conf > detection_threshold) frame_dets++; + } + total_detections += frame_dets; + + if (frame_count <= 5 || frame_count % 50 == 0) + LOG(INFO) << "[VideoDecodeInferCalculator] frame " << frame_count + << " detections=" << frame_dets; + } + + LOG(INFO) << "[VideoDecodeInferCalculator] Done: " << frame_count + << " frames, total detections=" << total_detections; + + imp_video_close(stream); + imp_context_destroy(ctx); + return total_detections; + } +}; + 
+REGISTER_CALCULATOR(VideoDecodeInferCalculator);
+
+}  // namespace mediapipe
diff --git a/src/test/mediapipe/calculators/video_decode_infer_test.cpp b/src/test/mediapipe/calculators/video_decode_infer_test.cpp
new file mode 100644
index 0000000000..c6cc1e0934
--- /dev/null
+++ b/src/test/mediapipe/calculators/video_decode_infer_test.cpp
@@ -0,0 +1,242 @@
+// Copyright (c) 2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * video_decode_infer_test — standalone end-to-end smoke test.
+ *
+ * Usage:
+ *   ./video_decode_infer_test <video_path> <model_path>
+ *
+ * Returns exit code 0 if at least one detection found across all frames,
+ * non-zero otherwise.
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <iostream>
+#include <vector>
+
+#include <openvino/openvino.hpp>
+
+#include "src/mpi/intel_mpi.h"
+
+// ============================================================================
+// NV12 → BGR float32 (nearest-neighbour resize, BT.601 limited range)
+// Output: NCHW [3 × H × W] float, values clamped to [0,255] (not normalised)
+//
+// Both source planes are indexed with a row stride of src_w bytes.
+// NOTE(review): confirm imp_tensor_get_nv12_planes() hands back tightly packed
+// planes; it reports no stride, so padded rows would be sampled incorrectly.
+//
+// If a plane pointer is null or any dimension is not positive, out_nchw is
+// cleared and nothing is converted.
+// ============================================================================
+static void nv12_to_bgr_float_nchw(const uint8_t* y_plane,
+                                   const uint8_t* uv_plane,
+                                   int src_w, int src_h,
+                                   int dst_w, int dst_h,
+                                   std::vector<float>& out_nchw) {
+    // Guard the divisions and plane reads below.
+    if (!y_plane || !uv_plane || src_w <= 0 || src_h <= 0 ||
+        dst_w <= 0 || dst_h <= 0) {
+        out_nchw.clear();
+        return;
+    }
+
+    out_nchw.resize(static_cast<size_t>(3) * dst_h * dst_w);
+
+    const float scale_x = static_cast<float>(src_w) / dst_w;
+    const float scale_y = static_cast<float>(src_h) / dst_h;
+
+    // Pointers to B/G/R planes in NCHW output
+    float* B = out_nchw.data();
+    float* G = B + static_cast<size_t>(dst_h) * dst_w;
+    float* R = G + static_cast<size_t>(dst_h) * dst_w;
+
+    auto clamp255 = [](float v) { return v < 0.f ? 0.f : (v > 255.f ? 255.f : v); };
+
+    for (int dy = 0; dy < dst_h; dy++) {
+        int sy = static_cast<int>(dy * scale_y);
+        if (sy >= src_h) sy = src_h - 1;
+        // One interleaved UV row serves two luma rows.
+        int uv_row = (sy / 2) * src_w;
+
+        for (int dx = 0; dx < dst_w; dx++) {
+            int sx = static_cast<int>(dx * scale_x);
+            if (sx >= src_w) sx = src_w - 1;
+
+            float Y = static_cast<float>(y_plane[sy * src_w + sx]) - 16.0f;
+
+            // U sits at the even byte of the pair, V right after it.
+            int uv_col = (sx / 2) * 2;
+            float U = static_cast<float>(uv_plane[uv_row + uv_col]) - 128.0f;
+            float V = static_cast<float>(uv_plane[uv_row + uv_col + 1]) - 128.0f;
+
+            float r = (1.164f * Y + 1.596f * V);
+            float g = (1.164f * Y - 0.392f * U - 0.813f * V);
+            float b = (1.164f * Y + 2.017f * U);
+
+            size_t pix = static_cast<size_t>(dy) * dst_w + dx;
+            B[pix] = clamp255(b);
+            G[pix] = clamp255(g);
+            R[pix] = clamp255(r);
+        }
+    }
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 3) {
+        std::cerr << "Usage: " << argv[0] << " <video_path> <model_path>\n";
+        return 2;
+    }
+    const char* video_path = argv[1];
+    const char* model_path = argv[2];
+
+    // ---- Open standalone context -----------------------------------------
+    imp_context_t* ctx = nullptr;
+    imp_status_t st = imp_context_create(&ctx, nullptr);
+    if (st != IMP_OK) {
+        std::cerr << "imp_context_create failed: " << st << "\n";
+        return 1;
+    }
+
+    // ---- Open video ---------------------------------------------------------
+    imp_video_source_t* src = nullptr;
+    imp_video_source_create(&src, IMP_SOURCE_FILE);
+    imp_video_source_set(src, "path", video_path);
+
+    imp_video_stream_t* stream = nullptr;
+    st = imp_video_open(&stream, src, ctx, nullptr);
+    imp_video_source_destroy(src);
+
+    if (st != IMP_OK) {
+        std::cerr << "imp_video_open failed: " << st
+                  << " (" << imp_context_get_error(ctx) << ")\n";
+        imp_context_destroy(ctx);
+        return 1;
+    }
+
+    uint32_t frame_w = 0, frame_h = 0;
+    imp_video_get_info(stream, &frame_w, &frame_h, nullptr, nullptr);
+    std::cout << "Video opened: " << frame_w << "x" << frame_h << "\n";
+
+    // ---- Load model ---------------------------------------------------------
+    ov::Core core;
+    ov::CompiledModel model;
+    try {
+        model = core.compile_model(model_path, "CPU");
+    } catch (const std::exception& e) {
+        std::cerr << "model compile failed: " << e.what() << "\n";
+        imp_video_close(stream);
+        imp_context_destroy(ctx);
+        return 1;
+    }
+
+    auto in_shape = model.input(0).get_shape();  // [1,3,H,W]
+    if (in_shape.size() != 4) {
+        std::cerr << "unexpected model input rank: " << in_shape.size() << "\n";
+        imp_video_close(stream);
+        imp_context_destroy(ctx);
+        return 1;
+    }
+    int model_h = static_cast<int>(in_shape[2]);
+    int model_w = static_cast<int>(in_shape[3]);
+    std::cout << "Model input: " << model_w << "x" << model_h << "\n";
+
+    ov::InferRequest req = model.create_infer_request();
+
+    // ---- Decode + infer loop ------------------------------------------------
+    int frame_count = 0;
+    int total_detections = 0;
+    const float threshold = 0.5f;
+    bool bad_output = false;
+    std::vector<float> nchw_buf;
+
+    for (;;) {
+        imp_tensor_t* tensor = nullptr;
+        st = imp_video_read_frame(&tensor, stream, 0);
+        if (st == IMP_ERROR_STREAM_END) break;
+        if (st != IMP_OK || !tensor) {
+            std::cerr << "read_frame failed: " << st << "\n";
+            break;
+        }
+        // NOTE(review): tensor is never released in this loop — confirm the
+        // ownership rules of imp_video_read_frame() in intel_mpi.h.
+
+        frame_count++;
+
+        const uint8_t* y_ptr = nullptr;
+        const uint8_t* uv_ptr = nullptr;
+        int fw = 0, fh = 0;
+        // Skip frames whose planes cannot be mapped instead of reading through
+        // null pointers.
+        if (imp_tensor_get_nv12_planes(tensor, &y_ptr, &uv_ptr, &fw, &fh) != IMP_OK ||
+            !y_ptr || !uv_ptr || fw <= 0 || fh <= 0) {
+            std::cerr << "get_nv12_planes failed on frame " << frame_count << "\n";
+            continue;
+        }
+
+        nv12_to_bgr_float_nchw(y_ptr, uv_ptr,
+                               fw, fh,
+                               model_w, model_h, nchw_buf);
+
+        ov::Tensor input_tensor(ov::element::f32,
+                                {1, 3,
+                                 static_cast<size_t>(model_h),
+                                 static_cast<size_t>(model_w)},
+                                nchw_buf.data());
+        req.set_input_tensor(input_tensor);
+        req.infer();
+
+        // Output: [1,1,N,7], one row per detection; column 2 is the confidence.
+        auto out = req.get_output_tensor(0);
+        const ov::Shape out_shape = out.get_shape();
+        if (out_shape.size() != 4 || out_shape[3] != 7) {
+            std::cerr << "unexpected model output shape, expected [1,1,N,7]\n";
+            bad_output = true;
+            break;
+        }
+        const float* det = out.data<float>();
+        size_t num_dets = out_shape[2];
+
+        int frame_dets = 0;
+        for (size_t i = 0; i < num_dets; i++) {
+            float conf = det[i * 7 + 2];
+            if (conf > threshold) frame_dets++;
+        }
+        total_detections += frame_dets;
+
+        if (frame_count <= 5 || frame_count % 50 == 0)
+            std::cout << " frame " << frame_count
+                      << " detections=" << frame_dets << "\n";
+    }
+
+    std::cout << "Done: " << frame_count << " frames, "
+              << "total detections=" << total_detections << "\n";
+
+    imp_video_close(stream);
+    imp_context_destroy(ctx);
+
+    if (bad_output) {
+        std::cerr << "FAIL: model output is not a detection tensor\n";
+        return 1;
+    }
+    if (total_detections < 1) {
+        std::cerr << "FAIL: expected at least 1 detection\n";
+        return 1;
+    }
+    std::cout << "PASS\n";
+    return 0;
+}
diff --git a/third_party/BUILD b/third_party/BUILD
index dcde0bd7e4..e9c08dd0b4 100644
--- a/third_party/BUILD
+++ b/third_party/BUILD
@@ -58,4 +58,13 @@ alias(
         "//conditions:default": "@linux_curl//:curl",
     }),
     visibility = ["//visibility:public"],
+)
+
+alias(
+    name = "gstreamer",
+    actual = select({
+        "//src:windows": "@windows_gstreamer//:gstreamer",
+        "//conditions:default": "@linux_gstreamer//:gstreamer_headers",
+    }),
+
visibility = ["//visibility:public"], ) \ No newline at end of file diff --git a/third_party/gstreamer/gstreamer_linux.BUILD b/third_party/gstreamer/gstreamer_linux.BUILD new file mode 100644 index 0000000000..4efc0a6b56 --- /dev/null +++ b/third_party/gstreamer/gstreamer_linux.BUILD @@ -0,0 +1,53 @@ +# +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# GStreamer headers for Linux. +# +# GStreamer is built from source in the gstreamer-builder Docker stage +# and installed to /opt/gstreamer (prefix). The glib headers are copied +# into the same prefix so the tree is self-contained. +# +# Layout expected under /opt/gstreamer: +# include/gstreamer-1.0/ GStreamer core + plugins headers +# include/glib-2.0/ GLib/GObject headers (copied from system) +# lib/gstreamer-1.0/include/ per-plugin generated headers (if any) +# +# NOTE: This target provides HEADERS ONLY — no linkopts. +# src/mpi:gst_loader dlopens the shared libs at runtime so the OVMS +# binary starts without GStreamer present on end-user systems. 
+
+package(default_visibility = ["//visibility:public"])
+
+cc_library(
+    name = "gstreamer_headers",  # headers only: hdrs + include paths, no srcs or linkopts
+    hdrs = glob([
+        "include/gstreamer-1.0/**/*.h",
+        "include/glib-2.0/**/*.h",
+        "lib/gstreamer-1.0/include/**/*.h",
+    ], allow_empty = True),  # allow_empty: a glob that matches nothing is not an error
+    includes = [
+        "include/gstreamer-1.0",
+        "include/glib-2.0",  # NOTE(review): glib.h needs glibconfig.h, usually under lib/.../glib-2.0/include; confirm it is copied here
+        "lib/gstreamer-1.0/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "gstreamer",  # wrapper that only forwards :gstreamer_headers
+    deps = [":gstreamer_headers"],
+    visibility = ["//visibility:public"],
+)