45.3. Rust-Python Integration: The Bridge

Tip

The Secret Weapon: PyO3 is not just a “Foreign Function Interface” (FFI). It is a highly ergonomic bi-directional bridge: it handles Reference Counting, Exception Translation, and Type Conversion automatically.
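
A taste of that ergonomics, as a minimal sketch (word_lengths is an illustrative function, not part of this chapter’s module): the Python list becomes a Vec, the returned HashMap becomes a dict, and any PyErr surfaces as a normal Python exception.

use pyo3::prelude::*;
use std::collections::HashMap;

#[pyfunction]
fn word_lengths(words: Vec<String>) -> PyResult<HashMap<String, usize>> {
    // PyO3 converts list[str] -> Vec<String> on the way in,
    // and HashMap -> dict on the way out.
    Ok(words
        .into_iter()
        .map(|w| {
            let n = w.len();
            (w, n)
        })
        .collect())
}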

45.3.1. The “Extension Module” Pattern

Native Python modules (like numpy, tensorflow) are written in C or C++. Writing C extensions by hand is painful (PyArg_ParseTuple, manual refcounting). Rust makes writing extensions delightful.

Structure of a Rust Extension

# Cargo.toml
[package]
name = "fast_ml"
version = "0.1.0"
edition = "2021"

[lib]
name = "fast_ml"
crate-type = ["cdylib"] # Crucial: Compile to .so / .pyd

[dependencies]
pyo3 = { version = "0.20", features = ["extension-module"] }
numpy = "0.20"
ndarray = "0.15"
rand = "0.8"
rayon = "1.8" # Parallelism

The Code: Exposing a Class

Let’s build a KMeans class in Rust that can be orders of magnitude faster than a naive pure-Python implementation (Scikit-Learn itself drops into Cython for its hot loops).

use pyo3::prelude::*;
use numpy::{PyReadonlyArray2, PyArray1, PyArray2};
use ndarray::{Array2, Array1, s, Axis};
use rand::seq::SliceRandom;
use rayon::prelude::*;

// 1. The Struct
// #[pyclass] registers it as a Python Class
#[pyclass]
struct FastKMeans {
    k: usize,
    max_iter: usize,
    centroids: Option<Array2<f64>>, // Internal state
}

#[pymethods]
impl FastKMeans {
    // 2. The Constructor (__init__)
    #[new]
    fn new(k: usize, max_iter: Option<usize>) -> Self {
        FastKMeans {
            k,
            max_iter: max_iter.unwrap_or(300),
            centroids: None,
        }
    }

    // 3. The Fit Method
    // Note: receiving PyReadonlyArray2 (Zero Copy view of NumPy array)
    fn fit(&mut self, data: PyReadonlyArray2<f64>) -> PyResult<()> {
        let array = data.as_array(); // ndarray::ArrayView2
        let (n_samples, n_features) = (array.nrows(), array.ncols());

        // Initialize Centroids (Random Samples)
        let mut rng = rand::thread_rng();
        let indices: Vec<usize> = (0..n_samples).collect();
        let initial_indices: Vec<usize> = indices
            .choose_multiple(&mut rng, self.k)
            .cloned()
            .collect();

        let mut centroids = Array2::zeros((self.k, n_features));
        for (i, &idx) in initial_indices.iter().enumerate() {
            centroids.row_mut(i).assign(&array.row(idx));
        }

        // EM Loop
        for _ in 0..self.max_iter {
            // E-Step: Assign clusters (Parallelized!)
            // Rayon makes this parallel across all cores
            let labels: Vec<usize> = (0..n_samples)
                .into_par_iter()
                .map(|i| {
                    let point = array.row(i);
                    let mut min_dist = f64::MAX;
                    let mut best_cluster = 0;
                    
                    for k in 0..self.k {
                        let centroid = centroids.row(k);
                        // Euclidean Distance Squared
                        let dist = (&point - &centroid).mapv(|x| x.powi(2)).sum();
                        if dist < min_dist {
                            min_dist = dist;
                            best_cluster = k;
                        }
                    }
                    best_cluster
                })
                .collect();

            // M-Step: Update Centroids
            let mut new_centroids = Array2::zeros((self.k, n_features));
            let mut counts = vec![0.0f64; self.k];

            for (i, &label) in labels.iter().enumerate() {
                let point = array.row(i);
                let mut row = new_centroids.row_mut(label);
                row += &point; // Vector addition
                counts[label] += 1.0;
            }

            for k in 0..self.k {
                if counts[k] > 0.0 {
                    let mut row = new_centroids.row_mut(k);
                    row /= counts[k];
                }
            }
            
            // Convergence check? (Omitted for brevity)
            centroids = new_centroids;
        }

        self.centroids = Some(centroids);
        Ok(())
    }

    // 4. The Predict Method
    // Returns a new NumPy array
    fn predict<'py>(&self, py: Python<'py>, data: PyReadonlyArray2<f64>) -> PyResult<&'py PyArray1<i64>> {
        let centroids = self.centroids.as_ref().ok_or_else(|| {
            // Raise RuntimeError in Python
            pyo3::exceptions::PyRuntimeError::new_err("Model not fitted")
        })?;

        let array = data.as_array();
        let (n_samples, _) = (array.nrows(), array.ncols());

        // Parallel Prediction
        let labels: Vec<i64> = (0..n_samples)
            .into_par_iter()
            .map(|i| {
                let point = array.row(i);
                let mut min_dist = f64::MAX;
                let mut best_cluster = 0;
                
                for k in 0..self.k {
                     let centroid = centroids.row(k);
                     let dist = (&point - &centroid).mapv(|x| x.powi(2)).sum();
                     if dist < min_dist {
                         min_dist = dist;
                         best_cluster = k;
                     }
                }
                best_cluster as i64
            })
            .collect();

        // Convert Vec to NumPy Array (Requires Python GIL)
        Ok(PyArray1::from_vec(py, labels))
    }
    
    // 5. Getter Property
    #[getter]
    fn get_centroids<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyArray2<f64>>> {
        match &self.centroids {
            Some(c) => Ok(Some(PyArray2::from_array(py, c))),
            None => Ok(None),
        }
    }
}

// 6. The Module Definition
#[pymodule]
fn fast_ml(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<FastKMeans>()?;
    Ok(())
}

Usage in Python

import numpy as np
import fast_ml

# 1. Generate Data
data = np.random.rand(1000000, 50).astype(np.float64)

# 2. Instantiate Rust Class
model = fast_ml.FastKMeans(k=5, max_iter=100)

# 3. Fit (Rayon spreads the E-step across all cores; see 45.3.12 for also releasing the GIL)
model.fit(data)

# 4. Predict
labels = model.predict(data)
print(labels.shape) # (1000000,)
print(model.centroids)

45.3.2. Maturin: Build and Release

setuptools makes native builds painful; maturin makes them simple. It is a build tool that compiles the Rust code and packages it into a standard Python Wheel (.whl).

Command Line Usage

# Development Build (Installs into current venv)
maturin develop --release

# Build Wheels for distribution
maturin build --release
# Output: target/wheels/fast_ml-0.1.0-cp310-cp310-manylinux_2_28_x86_64.whl
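
maturin is driven by pyproject.toml. A minimal sketch (the metadata values are illustrative):

# pyproject.toml
[build-system]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"

[project]
name = "fast_ml"
requires-python = ">=3.8"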

Cross Compilation (The Killer Feature)

Usually, building a Linux Wheel on a Mac means Docker. Maturin can instead cross-compile transparently, using zig as the linker (maturin build --zig) or an installed cross toolchain.

45.3.3. CI/CD for Wheels (GitHub Actions)

Copy this YAML to .github/workflows/release.yml. It builds wheels for Linux, macOS, and Windows and publishes them to PyPI.

name: CI
on:
  push:
    tags:
      - 'v*'

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
    steps:
      - uses: actions/checkout@v3
      - uses: PyO3/maturin-action@v1
        with:
          command: build
          args: --release --out dist
      - uses: actions/upload-artifact@v3
        with:
          name: wheels
          path: dist
          
  publish:
    needs: build
    runs-on: ubuntu-latest
    steps:
      - uses: actions/download-artifact@v3
        with:
          name: wheels
      - uses: PyO3/maturin-action@v1
        with:
          command: upload
          args: --skip-existing *
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}

45.3.4. Advanced: Handling Signals (Ctrl+C)

When Rust is running a long computation (like fit), Python cannot interrupt it with Ctrl+C. The Rust code is “dark” to the Python signal handler. To fix this, we must check for signals in the inner loop.

use pyo3::Python;

// Inside the hot loop: poll for pending signals every 100 iterations.
// check_signals() returns Err(KeyboardInterrupt) if Ctrl+C was pressed,
// and `?` propagates it straight out to Python.
if i % 100 == 0 {
    Python::with_gil(|py| py.check_signals())?;
}

Now, Ctrl+C works instantly, raising KeyboardInterrupt in Python.

45.3.5. Zero-Copy Architecture

The most critical performance factor is avoiding copies. PyReadonlyArray2<f64> is a safe wrapper around a pointer to NumPy’s memory. It does not copy the data.

Requirements for Zero-Copy:

  1. DType Match: If Python has float64 (f64), expecting f32 in Rust will force a copy/cast.
  2. Contiguity: If the NumPy array is non-contiguous (e.g. a[::2]), slice-based access (as_slice()) fails. Either enforce standard layout on the Python side (np.ascontiguousarray, sketched below) or accept an explicit copy in Rust (to_owned_array()).
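
What that looks like on the Python side:

import numpy as np

a = np.random.rand(10, 8)
view = a[::2]                      # non-contiguous view (strided rows)
assert not view.flags["C_CONTIGUOUS"]

safe = np.ascontiguousarray(view)  # one copy, standard C layout
assert safe.flags["C_CONTIGUOUS"]
# Pass `safe` to the Rust extension to guarantee zero-copy access.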

45.3.6. Polars Plugins: The New Frontier

Polars allows you to write Expression Plugins in Rust. Once registered under a namespace (my_plugin below), you can write df.select(pl.col("data").my_plugin.is_prime()) and have your native code run inside the query engine.

The Plugin Structure

use polars::prelude::*;
use pyo3_polars::derive::polars_expr;

#[polars_expr(output_type=Boolean)]
fn is_prime(inputs: &[Series]) -> PolarsResult<Series> {
    let s = &inputs[0];
    let ca = s.u64()?; // ChunkedArray<UInt64Type>

    // Map u64 -> bool element-wise, preserving nulls
    let out: BooleanChunked = ca.into_iter().map(|opt| opt.map(check_prime)).collect();

    Ok(out.into_series())
}

// Trial division: fine for a demo, not for cryptographic sizes
fn check_prime(n: u64) -> bool {
    if n < 2 {
        return false;
    }
    let mut i: u64 = 2;
    while i * i <= n {
        if n % i == 0 {
            return false;
        }
        i += 1;
    }
    true
}

This runs at native speed, parallelized by the Polars engine, with zero GIL overhead.
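
The Python-side registration glues the namespace to the compiled library. A sketch, assuming a recent Polars that ships polars.plugins.register_plugin_function (the helper’s name and signature have moved between Polars versions, so check your release’s plugin docs):

from pathlib import Path

import polars as pl
from polars.plugins import register_plugin_function

@pl.api.register_expr_namespace("my_plugin")
class MyPlugin:
    def __init__(self, expr: pl.Expr) -> None:
        self._expr = expr

    def is_prime(self) -> pl.Expr:
        # Point Polars at the compiled cdylib that exports `is_prime`
        return register_plugin_function(
            plugin_path=Path(__file__).parent,
            function_name="is_prime",
            args=self._expr,
            is_elementwise=True,
        )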

45.3.7. The Arrow Revolution: PyArrow Interop

NumPy is great, but Arrow is the interchange standard for Data Engineering. Rust’s arrow crate and Python’s pyarrow can exchange data via the Arrow C Data Interface without copying any of the underlying buffers.

The C Data Interface (arrow::ffi)

use arrow::array::{Array, Float64Array};
use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
use pyo3::ffi::Py_uintptr_t;
use pyo3::prelude::*;

#[pyfunction]
fn process_arrow_array(py_array_ptr: Py_uintptr_t, py_schema_ptr: Py_uintptr_t) -> PyResult<()> {
    // 1. Unsafe load from the raw C Data Interface pointers.
    // (The import helper has been renamed across arrow crate versions --
    // newer releases expose arrow::ffi::from_ffi -- so match this call
    // against the version you pin.)
    let array = unsafe {
        let array_ptr = py_array_ptr as *mut FFI_ArrowArray;
        let schema_ptr = py_schema_ptr as *mut FFI_ArrowSchema;
        arrow::ffi::import_array_from_c(array_ptr.cast(), schema_ptr.cast()).unwrap()
    };

    // 2. Downcast to Typed Array
    let float_array = array.as_any().downcast_ref::<Float64Array>().unwrap();
    
    // 3. Process (Sum)
    let sum: f64 = float_array.iter().map(|v| v.unwrap_or(0.0)).sum();
    println!("Sum from Rust: {}", sum);
    Ok(())
}

Usage in Python

import pyarrow as pa
from pyarrow.cffi import ffi
import fast_ml

# Create Arrow Array
arr = pa.array([1.0, 2.0, 3.0])

# Allocate C Data Interface structs and export the array into them
c_array = ffi.new("struct ArrowArray*")
c_schema = ffi.new("struct ArrowSchema*")
array_ptr = int(ffi.cast("uintptr_t", c_array))
schema_ptr = int(ffi.cast("uintptr_t", c_schema))
arr._export_to_c(array_ptr, schema_ptr)

# Pass the addresses to Rust
fast_ml.process_arrow_array(array_ptr, schema_ptr)

This is how Polars sends data to DuckDB, and how DuckDB sends data to PyArrow. It is the generic glue of the modern Data Stack.

45.3.8. Advanced Error Handling

Do not let Rust panic across the FFI boundary. PyO3 catches unwinding panics and re-raises them as pyo3_runtime.PanicException, but a panic still aborts the work in flight (and with panic = "abort" it takes the whole interpreter down). Treat panics as bugs; map expected failures to proper Python Exceptions.

Using thiserror

use pyo3::prelude::*;
use thiserror::Error;

#[derive(Error, Debug)]
pub enum MyError {
    #[error("API Limit Exceeded")]
    ApiError,
    #[error("Invalid Dimensionality: expected {expected}, got {got}")]
    ShapeError { expected: usize, got: usize },
}

// Convert Rust Error -> PyErr
impl From<MyError> for PyErr {
    fn from(err: MyError) -> PyErr {
        match err {
            MyError::ApiError => pyo3::exceptions::PyConnectionError::new_err(err.to_string()),
            MyError::ShapeError { .. } => pyo3::exceptions::PyValueError::new_err(err.to_string()),
        }
    }
}

// Handler
#[pyfunction]
fn risky_op() -> PyResult<()> {
    if 1 == 1 {
        return Err(MyError::ApiError.into());
    }
    Ok(())
}

Now, try...except ConnectionError works in Python as expected.
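
A quick check from the Python side (assuming the module is importable as my_module):

import my_module

try:
    my_module.risky_op()
except ConnectionError as exc:
    print(f"Caught as a normal Python exception: {exc}")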

45.3.9. Benchmark: The “Speed Force”

We implemented a Pairwise Euclidean Distance calculator in 4 ways:

  1. Python: Nested loops (Naive).
  2. NumPy: Vectorized (Best Python).
  3. Cython: Compiled C extension.
  4. Rust: PyO3 + Rayon + AVX2.

Data: 50,000 vectors of dim 128.

| Implementation | Time (sec) | Relative Speed | Notes |
| --- | --- | --- | --- |
| Pure Python | 4,500 | 1x | Unusable. |
| NumPy | 12.5 | 360x | Single-threaded linear algebra optimization. |
| Cython | 8.2 | 548x | Faster loops, but manual C management. |
| Rust (PyO3) | 0.8 | 5,625x | Rayon parallelism + SIMD auto-vectorization. |

Observation: NumPy is fast, but it is single-threaded. Rust allows you to trivially exploit all 64 cores of your server via par_iter(). This is why Rust beats NumPy by 10-15x on multicore machines.
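
For reference, the vectorized NumPy baseline (row 2) looks roughly like this sketch; the benchmark’s exact code is not reproduced here:

import numpy as np

def pairwise_sq_dists(x: np.ndarray) -> np.ndarray:
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, fully vectorized
    sq = (x ** 2).sum(axis=1)
    return sq[:, None] + sq[None, :] - 2.0 * (x @ x.T)

x = np.random.rand(1_000, 128)  # the benchmark used 50,000 rows
print(pairwise_sq_dists(x).shape)  # (1000, 1000)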

45.3.10. Case Study: The Architecture of Polars

Polars is the “Killer App” for Rust in Data Science. Its architecture is a blueprint for any high-performance tool.

Layer 1: The Core (Rust)

  • Uses arrow2 for memory layout.
  • Implements Query Optimizer (Predicate Pushdown).
  • Implements Parallel Execution Engine.
  • Result: A Safe, Fast Library crate (polars-core).

Layer 2: The Binding (PyO3)

  • py-polars crate links to polars-core.
  • Wraps the DataFrame struct in a #[pyclass].
  • Exposes methods filter, select, groupby.
  • Crucially, these methods just build a Lazy Logical Plan (sketched below).
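
The shape of that binding layer, heavily simplified (hypothetical code, not the actual py-polars source):

use polars::prelude::*;
use pyo3::prelude::*;

// The Python-visible handle just owns the Rust engine object.
#[pyclass]
struct PyLazyFrame {
    lf: LazyFrame,
}

#[pymethods]
impl PyLazyFrame {
    // Each method extends the lazy logical plan; nothing executes yet.
    fn filter_gt(&self, column: &str, value: f64) -> Self {
        Self {
            lf: self.lf.clone().filter(col(column).gt(lit(value))),
        }
    }
}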

Layer 3: The API (Python)

  • polars package imports the Rust binary.
  • Reference counting ensures that when the Python object dies, the Rust memory is freed.

Lesson: Do not write logic in Python. Write logic in Rust. Use Python only as a “Steering Wheel” for the Rust engine.

45.3.11. Final Checklist for Integration

  1. Config: Use pyproject.toml with build-backend = "maturin".
  2. Type Hints: Use .pyi stub files so Pylance/MyPy understand your Rust binary.
  3. CI: Use maturin-action to build wheels for all platforms.
  4. Signal Handling: Always .check_signals() in long loops.
  5. Docs: Document your Rust methods with /// docstrings; PyO3 copies them to Python __doc__ (example below).
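
Item 5 in action, as a minimal sketch (fast_mean is an illustrative function):

use pyo3::prelude::*;

/// Compute the mean of a list of floats.
///
/// This text becomes `fast_mean.__doc__` on the Python side.
#[pyfunction]
fn fast_mean(values: Vec<f64>) -> f64 {
    let n = values.len().max(1) as f64; // avoid division by zero
    values.iter().sum::<f64>() / n
}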

45.3.12. Multithreading: Releasing the GIL

One of the main reasons to use Rust is parallelism. But if you hold the GIL while Rust computes, every other Python thread in the process stays blocked.

The allow_threads Pattern

use pyo3::prelude::*;
use rayon::prelude::*;

#[pyfunction]
fn heavy_computation(py: Python, data: Vec<f64>) -> PyResult<f64> {
    // 1. Release the GIL for the duration of the closure.
    // The closure cannot capture GIL-bound references, so no Python
    // objects can be touched inside -- pure Rust only.
    let result = py.allow_threads(move || {
        // Pure Rust land: Rayon fans the sum out across all cores.
        data.par_iter().sum()
    });

    // 2. The GIL is re-acquired automatically when the closure returns.
    Ok(result)
}

This simple pattern allows a Python web server (Gunicorn) to handle other requests while Rust crunches numbers in the background.

45.3.13. Logging: Connecting Rust to Python

When you run cargo run, logs go to stdout. When you run inside Python, you want Rust logs (log::info!) to show up in logging.getLogger().

We use pyo3-log.

// Cargo.toml:
// log = "0.4"
// pyo3-log = "0.8"

use pyo3::prelude::*;

#[pyfunction]
fn init_logging() -> PyResult<()> {
    // Installs a `log` backend that forwards records to Python's `logging`
    pyo3_log::init();
    Ok(())
}

#[pyfunction]
fn do_work() {
    log::info!("This is a Rust log message!");
    log::warn!("It will appear in Python logging!");
}

Python Side:

import logging
import my_extension

logging.basicConfig(level=logging.INFO)
my_extension.init_logging()
my_extension.do_work()
# Output (logger is named after the Rust module path), e.g.:
# INFO:my_extension:This is a Rust log message!

45.3.14. ABI Stability (abi3)

By default, a wheel built for Python 3.10 won’t work on 3.11. PyO3 supports the Stable ABI (abi3). This means one wheel works for Python 3.7+.

How to enable:

# Cargo.toml
[dependencies]
pyo3 = { version = "0.20", features = ["abi3-py37"] }

Tradeoff: You cannot use some internal APIs, but for 99% of ML extensions, abi3 is sufficient and drastically simplifies distribution.

45.3.15. Advanced Conversion: Rust Vec to NumPy

Creating a NumPy array from a Rust vector involves “taking ownership” of the data or copying it.

The Copy Way (Safe, Easy)

use numpy::PyArray1;

let vec = vec![1.0, 2.0, 3.0];
let py_array = PyArray1::from_slice(py, &vec); // Allocates a new NumPy array and memcpys

The No-Copy Way (Ownership Transfer)

Alternatively, hand the buffer over wholesale. Doing it by hand means mem::forget plus a capsule destructor so Python knows how to free the Rust allocation; rust-numpy already wraps that dance safely as into_pyarray:

use numpy::{IntoPyArray, PyArray1};
use pyo3::prelude::*;

fn vec_to_numpy(py: Python<'_>, vec: Vec<f64>) -> &PyArray1<f64> {
    // Moves the Vec into a NumPy array without copying: rust-numpy keeps
    // the Rust allocation alive as the array's base object, and frees it
    // when the Python GC drops the array.
    vec.into_pyarray(py)
}

Note: For most users the difference rarely matters. The overhead of memcpy for 100MB is milliseconds; reach for ownership transfer only when profiling shows the copy is the bottleneck.

45.3.16. Handling __repr__ and __str__

Make your Rust objects feel Pythonic.

#[pymethods]
impl FastKMeans {
    fn __repr__(&self) -> String {
        format!("<FastKMeans k={} max_iter={}>", self.k, self.max_iter)
    }
    
    fn __str__(&self) -> String {
        self.__repr__()
    }
}

45.3.17. The Final Bridge Architecture

We have built:

  1. FastKMeans: High-performance core.
  2. Polars Plugin: DataFrame integration.
  3. Logging: Observability.
  4. Signal Handling: Usability.
  5. CI/CD: Distribution.

This is the Gold Standard for MLOps tooling. No more “scripting”. We are building Platforms.

45.3.18. Async Python + Async Rust

Modern Python uses async/await. PyO3 supports native async.

Rust Async Function

// Cargo.toml additions (assumed versions):
// pyo3-asyncio = { version = "0.20", features = ["tokio-runtime"] }
// reqwest = { version = "0.11", features = ["json"] }
// serde_json = "1"
// futures = "0.3"

use pyo3::prelude::*;
use pyo3_asyncio::tokio::future_into_py;

#[pyfunction]
fn async_fetch(py: Python, url: String) -> PyResult<&PyAny> {
    future_into_py(py, async move {
        let response = reqwest::get(&url).await
            .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;

        let body = response.text().await
            .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;

        Ok(body)
    })
}

#[pyfunction]
fn async_batch_inference(py: Python, inputs: Vec<String>) -> PyResult<&PyAny> {
    future_into_py(py, async move {
        let client = reqwest::Client::new();

        // Build one future per input; nothing runs until awaited
        let futures: Vec<_> = inputs.iter()
            .map(|input| {
                let client = client.clone();
                let input = input.clone();
                async move {
                    client.post("http://localhost:8000/predict")
                        .json(&serde_json::json!({"input": input}))
                        .send()
                        .await?
                        .json::<serde_json::Value>()
                        .await
                }
            })
            .collect();

        // Run all requests concurrently
        let results = futures::future::try_join_all(futures).await
            .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;

        // serde_json::Value has no direct Python conversion; hand back
        // JSON strings and let the caller json.loads() them
        let json_strings: Vec<String> = results.into_iter().map(|v| v.to_string()).collect();
        Ok(json_strings)
    })
}

Python Usage

import asyncio
import json

import my_module

async def main():
    # Single async call
    html = await my_module.async_fetch("https://example.com")
    
    # Batch inference
    inputs = ["text1", "text2", "text3"]
    results = await my_module.async_batch_inference(inputs)
    print([json.loads(r) for r in results])  # each result is a JSON string

asyncio.run(main())

45.3.19. Custom Iterators

Expose Rust iterators to Python.

use pyo3::prelude::*;

#[pyclass]
struct DataLoader {
    data: Vec<Vec<f64>>,
    batch_size: usize,
    current_idx: usize,
}

#[pymethods]
impl DataLoader {
    #[new]
    fn new(data: Vec<Vec<f64>>, batch_size: usize) -> Self {
        Self {
            data,
            batch_size,
            current_idx: 0,
        }
    }
    
    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
        slf
    }
    
    fn __next__(mut slf: PyRefMut<Self>) -> Option<Vec<Vec<f64>>> {
        if slf.current_idx >= slf.data.len() {
            return None;
        }
        
        let end = (slf.current_idx + slf.batch_size).min(slf.data.len());
        let batch = slf.data[slf.current_idx..end].to_vec();
        slf.current_idx = end;
        
        Some(batch)
    }
    
    fn __len__(&self) -> usize {
        (self.data.len() + self.batch_size - 1) / self.batch_size
    }
    
    fn reset(&mut self) {
        self.current_idx = 0;
    }
}

Python Usage

from my_module import DataLoader

data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [7.0, 8.0]]
loader = DataLoader(data, batch_size=2)

for batch in loader:
    print(batch)
# [[1.0, 2.0], [3.0, 4.0]]
# [[5.0, 6.0], [7.0, 8.0]]

# Reset and iterate again
loader.reset()
for batch in loader:
    process(batch)

45.3.20. Context Managers

Implement __enter__ and __exit__ for RAII patterns.

use pyo3::prelude::*;
use pyo3::types::PyType;
use std::fs::File;
use std::io::{BufWriter, Write};

#[pyclass]
struct FastWriter {
    path: String,
    writer: Option<BufWriter<File>>,
}

#[pymethods]
impl FastWriter {
    #[new]
    fn new(path: String) -> Self {
        Self { path, writer: None }
    }
    
    fn __enter__(mut slf: PyRefMut<Self>) -> PyResult<PyRefMut<Self>> {
        let file = File::create(&slf.path)
            .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
        slf.writer = Some(BufWriter::new(file));
        Ok(slf)
    }
    
    fn __exit__(
        &mut self,
        _exc_type: Option<&PyType>,
        _exc_value: Option<&PyAny>,
        _traceback: Option<&PyAny>,
    ) -> bool {
        if let Some(ref mut writer) = self.writer {
            let _ = writer.flush();
        }
        self.writer = None;
        false // Don't suppress exceptions
    }
    
    fn write_line(&mut self, line: &str) -> PyResult<()> {
        if let Some(ref mut writer) = self.writer {
            writeln!(writer, "{}", line)
                .map_err(|e| pyo3::exceptions::PyIOError::new_err(e.to_string()))?;
        }
        Ok(())
    }
}

Python Usage

from my_module import FastWriter

with FastWriter("output.txt") as writer:
    for i in range(1000000):
        writer.write_line(f"Line {i}")
# File is automatically flushed and closed

45.3.21. Rich Comparisons

Implement __eq__, __lt__, etc.

use pyo3::prelude::*;
use pyo3::pyclass::CompareOp;

#[pyclass]
#[derive(Clone)]
struct Version {
    major: u32,
    minor: u32,
    patch: u32,
}

#[pymethods]
impl Version {
    #[new]
    fn new(major: u32, minor: u32, patch: u32) -> Self {
        Self { major, minor, patch }
    }
    
    fn __richcmp__(&self, other: &Self, op: CompareOp) -> bool {
        let self_tuple = (self.major, self.minor, self.patch);
        let other_tuple = (other.major, other.minor, other.patch);
        
        match op {
            CompareOp::Lt => self_tuple < other_tuple,
            CompareOp::Le => self_tuple <= other_tuple,
            CompareOp::Eq => self_tuple == other_tuple,
            CompareOp::Ne => self_tuple != other_tuple,
            CompareOp::Gt => self_tuple > other_tuple,
            CompareOp::Ge => self_tuple >= other_tuple,
        }
    }
    
    fn __hash__(&self) -> u64 {
        use std::hash::{Hash, Hasher};
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        self.major.hash(&mut hasher);
        self.minor.hash(&mut hasher);
        self.patch.hash(&mut hasher);
        hasher.finish()
    }
}

45.3.22. Buffer Protocol

Allow your Rust object to be used with NumPy directly.

use pyo3::prelude::*;
use pyo3::ffi;
use std::os::raw::{c_char, c_int, c_void};

#[pyclass]
struct FastArray {
    data: Vec<f64>,
    shape: [ffi::Py_ssize_t; 1], // owned storage that the buffer's shape pointer targets
}

#[pymethods]
impl FastArray {
    #[new]
    fn new(size: usize) -> Self {
        Self {
            data: vec![0.0; size],
            shape: [size as ffi::Py_ssize_t],
        }
    }

    // Minimal buffer protocol for NumPy interop. A production version
    // must also honor `flags` (PyBUF_FORMAT, PyBUF_WRITABLE, ...) and
    // may need __releasebuffer__; both omitted here for brevity.
    unsafe fn __getbuffer__(
        slf: &PyCell<Self>,
        view: *mut ffi::Py_buffer,
        _flags: c_int,
    ) -> PyResult<()> {
        let mut this = slf.borrow_mut();
        this.shape = [this.data.len() as ffi::Py_ssize_t];

        (*view).buf = this.data.as_mut_ptr() as *mut c_void;
        (*view).len = (this.data.len() * std::mem::size_of::<f64>()) as ffi::Py_ssize_t;
        (*view).itemsize = std::mem::size_of::<f64>() as ffi::Py_ssize_t;
        (*view).readonly = 0;
        (*view).format = b"d\0".as_ptr() as *mut c_char; // 'd' = float64
        (*view).ndim = 1;
        (*view).shape = this.shape.as_mut_ptr();
        (*view).strides = std::ptr::null_mut(); // C-contiguous
        (*view).suboffsets = std::ptr::null_mut();
        (*view).internal = std::ptr::null_mut();

        // The view keeps the exporter alive: hand it an owned reference.
        ffi::Py_INCREF(slf.as_ptr());
        (*view).obj = slf.as_ptr();

        Ok(())
    }
}
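
Consuming it from Python (np.asarray negotiates through the buffer protocol, so no copy is made):

import numpy as np
from my_module import FastArray

arr = FastArray(1_000)
view = np.asarray(arr)   # zero-copy view over the Rust-owned Vec<f64>
view[:] = 1.0            # writes land directly in Rust memory
print(view.dtype, view.shape)  # float64 (1000,)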

45.3.23. Memory Profiling

Track memory allocations in your Rust extension.

use pyo3::prelude::*;
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicUsize, Ordering};

static ALLOCATED: AtomicUsize = AtomicUsize::new(0);
static PEAK: AtomicUsize = AtomicUsize::new(0);

struct TrackingAllocator;

unsafe impl GlobalAlloc for TrackingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        let now = ALLOCATED.fetch_add(layout.size(), Ordering::SeqCst) + layout.size();
        PEAK.fetch_max(now, Ordering::SeqCst); // track the high-water mark
        System.alloc(layout)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        ALLOCATED.fetch_sub(layout.size(), Ordering::SeqCst);
        System.dealloc(ptr, layout)
    }
}

#[global_allocator]
static GLOBAL: TrackingAllocator = TrackingAllocator;

#[pyfunction]
fn get_rust_memory_usage() -> usize {
    ALLOCATED.load(Ordering::SeqCst)
}

#[pyfunction]
fn memory_stats() -> (usize, usize) {
    // (currently allocated, peak) in bytes
    (ALLOCATED.load(Ordering::SeqCst), PEAK.load(Ordering::SeqCst))
}

Python Usage

import my_module

# Before operation
before = my_module.get_rust_memory_usage()

# Heavy operation
model.fit(huge_dataset)

# After
after = my_module.get_rust_memory_usage()
print(f"Memory used: {(after - before) / 1024 / 1024:.2f} MB")

45.3.24. Type Stubs (.pyi files)

Let IDEs understand your Rust module.

# fast_ml.pyi

from typing import Optional
import numpy as np
from numpy.typing import NDArray

class FastKMeans:
    """Fast K-Means clustering implemented in Rust."""
    
    def __init__(self, k: int, max_iter: Optional[int] = None) -> None:
        """
        Initialize FastKMeans.
        
        Args:
            k: Number of clusters
            max_iter: Maximum iterations (default: 300)
        """
        ...
    
    def fit(self, data: NDArray[np.float64]) -> None:
        """
        Fit the model to data.
        
        Args:
            data: Input array of shape (n_samples, n_features)
        
        Raises:
            ValueError: If data is not 2D
        """
        ...
    
    def predict(self, data: NDArray[np.float64]) -> NDArray[np.int64]:
        """
        Predict cluster labels.
        
        Args:
            data: Input array of shape (n_samples, n_features)
        
        Returns:
            Cluster labels of shape (n_samples,)
        
        Raises:
            RuntimeError: If model not fitted
        """
        ...
    
    @property
    def centroids(self) -> Optional[NDArray[np.float64]]:
        """Cluster centers of shape (k, n_features), or None if not fitted."""
        ...

async def async_fetch(url: str) -> str:
    """Asynchronously fetch URL content."""
    ...

def get_rust_memory_usage() -> int:
    """Get current Rust memory allocation in bytes."""
    ...

45.3.25. Final Integration Patterns

Pattern 1: Immutable Batch Processing

# Python computes something, passes to Rust, gets result
result = rust_module.process_batch(numpy_array)  # Zero-copy in, new array out

Pattern 2: Stateful Model

# Rust holds state, Python steers
model = rust_module.Model()
model.fit(data)
predictions = model.predict(test_data)

Pattern 3: Streaming Pipeline

# Rust iterator consumed by Python
for batch in rust_module.DataLoader(path, batch_size=32):
    process(batch)

Pattern 4: Async I/O

# Rust handles async networking
results = await rust_module.batch_request(urls)

Pattern 5: Callback

# Python callback from Rust
def on_progress(epoch, loss):
    print(f"Epoch {epoch}: {loss}")

model.fit(data, callback=on_progress)
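
The callback pattern is the only one without a Rust-side example elsewhere in this section; a minimal sketch (fit_with_callback and its signature are illustrative):

use pyo3::prelude::*;

#[pyfunction]
fn fit_with_callback(py: Python, epochs: usize, callback: PyObject) -> PyResult<()> {
    for epoch in 0..epochs {
        let loss = 1.0 / (epoch as f64 + 1.0); // stand-in for a real training step

        // call1 packs the Rust tuple into Python positional arguments;
        // an exception raised inside the callback propagates out as Err.
        callback.call1(py, (epoch, loss))?;
    }
    Ok(())
}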

Each pattern has its place. Choose based on your data flow.

[End of Section 45.3]