Pentaho Academy Beta site ..

MLflow

MLflow

MLflow is an open-source platform for managing the machine learning lifecycle. It's designed to help data scientists and ML engineers track, reproduce, and deploy machine learning models more effectively.

MLflow contains the following components.

  • MLflow Tracking - An engineer will use this feature the most. It allows experiments to be recorded and queried. It also keeps track of the code, data, configuration and results for each experiment.

  • MLflow Projects - Allows experiments to be reproduced by packaging the code into a platform agnostic format.

  • MLflow Models - Deploys machine learning models to an environment where they can be served.

  • MLflow Model Registry - Allows for the storage, annotation, discovery, and management of models in a central repository.

MLflow

Docker & Docker-Compose

  1. Create MLflow folder and copy over scripts:

# Create the MLflow folder with its two subfolders: data/ and mlflow/
mkdir -p ~/MLflow/{data,mlflow}

# Copy all the MLflow files from Workshop--Data-Catalog/MLflow
cp -r ~/Workshop--Data-Catalog/MLflow/* ~/MLflow/

# Move the Dockerfile into the mlflow/ subfolder (the compose file
# builds the tracking server from ./mlflow). Use absolute paths so the
# commands work regardless of the current working directory; note the
# "/" after "~" — "~MLflow" would be a literal directory name, not a
# path under the home directory.
mv ~/MLflow/Dockerfile ~/MLflow/mlflow/

Here's the docker-compose.yml file:

services:
  # PostgreSQL — backend store for MLflow experiment/run metadata.
  db:
    restart: always
    image: postgres:15  # Pinned major version for consistency
    container_name: mlflow_db
    expose:
      - "${PG_PORT}"
    networks:
      - backend
    environment:
      - POSTGRES_USER=${PG_USER}
      - POSTGRES_PASSWORD=${PG_PASSWORD}
      - POSTGRES_DB=${PG_DATABASE}
    volumes:
      - db_data:/var/lib/postgresql/data/
    healthcheck:
      # Probe the same port the service exposes; hardcoding 5432 here
      # would break the healthcheck whenever PG_PORT is set to
      # anything else.
      test: ["CMD", "pg_isready", "-p", "${PG_PORT}", "-U", "${PG_USER}"]
      interval: 5s
      timeout: 5s
      retries: 3

  # MinIO — S3-compatible object store for MLflow artifacts.
  s3:
    restart: always
    image: minio/minio:RELEASE.2025-04-22T22-12-26Z  # Last version with full admin UI
    container_name: mlflow_minio
    volumes:
      - minio_data:/data
    ports:
      - "${MINIO_PORT}:9000"
      - "${MINIO_CONSOLE_PORT}:9001"
    networks:
      - frontend
      - backend
    environment:
      - MINIO_ROOT_USER=${MINIO_ROOT_USER}
      - MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}
      - MINIO_ADDRESS=${MINIO_ADDRESS}
      - MINIO_PORT=${MINIO_PORT}
      - MINIO_STORAGE_USE_HTTPS=${MINIO_STORAGE_USE_HTTPS}
      - MINIO_CONSOLE_ADDRESS=${MINIO_CONSOLE_ADDRESS}
    command: server /data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  # MLflow tracking server — built from ./mlflow/Dockerfile.
  tracking_server:
    restart: always
    build: ./mlflow
    image: mlflow_server
    container_name: mlflow_server
    # Wait until db and s3 report healthy, not merely started; both
    # services define healthchecks, so the short list form would let
    # the tracking server race them during startup.
    depends_on:
      db:
        condition: service_healthy
      s3:
        condition: service_healthy
    ports:
      - "${MLFLOW_PORT}:5000"
    networks:
      - frontend
      - backend
    environment:
      # NOTE(review): these assume MINIO_ACCESS_KEY / MINIO_SECRET_ACCESS_KEY
      # are defined in .env; if the intent is to reuse the root
      # credentials above, point them at MINIO_ROOT_USER /
      # MINIO_ROOT_PASSWORD instead — confirm against the .env file.
      - AWS_ACCESS_KEY_ID=${MINIO_ACCESS_KEY}
      - AWS_SECRET_ACCESS_KEY=${MINIO_SECRET_ACCESS_KEY}
      # Container-internal endpoint: inside the backend network MinIO
      # always listens on 9000 (the container side of the
      # "${MINIO_PORT}:9000" mapping), so the host-side MINIO_PORT
      # must not be used here.
      - MLFLOW_S3_ENDPOINT_URL=http://s3:9000
      - MLFLOW_S3_IGNORE_TLS=true
    command: >
      mlflow server
      --backend-store-uri postgresql://${PG_USER}:${PG_PASSWORD}@db:${PG_PORT}/${PG_DATABASE}
      --host 0.0.0.0
      --serve-artifacts
      --artifacts-destination s3://${MLFLOW_BUCKET_NAME}

    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/"]
      interval: 30s
      timeout: 10s
      retries: 3

# Named volumes: both Postgres and MinIO persist to the local Docker
# volume store, so data survives container restarts.
volumes:
  db_data:
  minio_data:

networks:
  frontend:
    driver: bridge
  backend:
    driver: bridge
Note that both MinIO and PostgreSQL persist their data to Docker named volumes (`minio_data` and `db_data`) on the local file system, so data survives container restarts.

Last updated

Was this helpful?