Data isn't stored on disk?

I followed this tutorial: Hands-on with Apache Iceberg on Your Laptop: Deep Dive with Apache Spark, Nessie, Minio, Dremio, Polars and Seaborn
but it doesn't store data on disk; the data only lives in memory.
So how do I store the data on disk?
I mean, after docker compose down and then docker compose up, all the data is erased.
How do I configure this in the docker-compose.yml file?


@ganbaatar What command do you use to bring down Dremio? From the document @AlexMerced wrote, I can see two options.

Spinning Down the Environment
To stop and remove all running services, use the following command:

  • docker-compose down

    This stops the services and removes the associated containers and networks. Your data will still be preserved in the mounted volumes, so any changes made to your data (such as in Minio or Nessie) will remain intact the next time you spin up the environment.

You can also use the following flags with docker-compose down:

--volumes: This flag will also remove the named volumes declared in the compose file. Use this if you want to completely clean up the environment, including any data persisted in those volumes.
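One subtlety worth noting: bind mounts to host paths (entries like ./some_dir:/container_dir) are never deleted by docker-compose down, with or without --volumes; only named volumes declared in the compose file are removed by the flag. A minimal sketch, with illustrative service and path names:

services:
  example:
    image: alpine
    volumes:
      - ./host_data:/data   # bind mount: survives docker-compose down, even with --volumes
      - named_data:/cache   # named volume: removed by docker-compose down --volumes

volumes:
  named_data: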

You can also consult the Nessie documentation at projectnessie.org to configure a backing store like Postgres or MongoDB.

These would be passed as environment variables in the Docker Compose file.
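For example, a Nessie service backed by Postgres might look roughly like the sketch below; the exact property names can vary between Nessie releases, so verify them against the documentation for the version you are running:

services:
  nessie:
    image: projectnessie/nessie:latest
    depends_on:
      - postgres
    environment:
      # switch the version store from the in-memory default to a JDBC backing store
      - NESSIE_VERSION_STORE_TYPE=JDBC
      - QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/nessie
      - QUARKUS_DATASOURCE_USERNAME=nessie
      - QUARKUS_DATASOURCE_PASSWORD=nessie_password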

Yes, I have tried having Nessie store its metadata in Postgres, but after docker compose down and docker compose up, Dremio still shows the Nessie source list, yet it can't show the data. My docker-compose.yml is below. What is the problem with it? Please try my docker-compose.yml file and fix it for me.
I hope your help will also help future Dremio users. @AlexMerced @balaji.ramaswamy

#version: '3.8'

services:
  # Minio Storage Server
  minio:
    image: minio/minio
    container_name: minio
    restart: always
    environment:
      - MINIO_ROOT_USER=admin
      - MINIO_ROOT_PASSWORD=password
      - MINIO_DOMAIN=minio
      - MINIO_REGION_NAME=us-east-1
      - MINIO_REGION=us-east-1
    ports:
      - "9000:9000"
      - "9001:9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
    volumes:
      - ./minio_data:/minio-data
    entrypoint: >
      /bin/sh -c "
      chmod -R 777 postgres-data/;
      chmod -R 777 dremio-data/;
      minio server /data --console-address ':9001' &
      sleep 5;
      mc alias set myminio http://localhost:9000 admin password;
      mc mb myminio/datalake;
      mc mb myminio/datalakehouse;
      mc mb myminio/warehouse;
      mc mb myminio/seed;
      mc cp /minio-data/* myminio/seed/;
      tail -f /dev/null"
    networks:
      - data_lake_network

  # Spark configuration
  spark:
    image: alexmerced/spark35nb:latest
    restart: always
    ports: 
      - 8080:8080
      - 7077:7077
      - 8081:8081
      - 4040-4045:4040-4045
      - 18080:18080
      - 8888:8888
    environment:
      - AWS_REGION=us-east-1
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - SPARK_MASTER_HOST=spark
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_WEBUI_PORT=8080
      - SPARK_WORKER_WEBUI_PORT=8081
      - SPARK_HISTORY_OPTS=-Dspark.history.fs.logDirectory=/tmp/spark-events
      - SPARK_HOME=/opt/spark
    volumes:
      - ./notebook-seed:/workspace/seed-data
    container_name: spark
    entrypoint: >
      /bin/bash -c "
      /opt/spark/sbin/start-master.sh && \
      /opt/spark/sbin/start-worker.sh spark://$(hostname):7077 && \
      mkdir -p /tmp/spark-events && \
      start-history-server.sh && \
      jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' && \
      tail -f /dev/null
      "
    networks:
      - data_lake_network

  # PostgreSQL Database for Nessie
  postgres:
    image: postgres:17
    container_name: nessie-postgres
    restart: always
    environment:
      - POSTGRES_USER=nessie
      - POSTGRES_PASSWORD=nessie_password
      - POSTGRES_DB=nessie
    # user: "${UID}:${GID}"
    volumes:
      - ./postgres-data:/var/lib/postgresql/data:rw  # Add :rw to ensure read-write permissions
    ports:
      - "5432:5432"
    networks:
      - data_lake_network
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U nessie"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Nessie Catalog Server
  nessie:
    image: projectnessie/nessie:latest
    container_name: nessie
    restart: always
    depends_on:
      postgres:
        condition: service_healthy
    environment:
      - NESSIE_VERSION_STORE_TYPE=JDBC
      - QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/nessie
      - QUARKUS_DATASOURCE_USERNAME=nessie
      - QUARKUS_DATASOURCE_PASSWORD=nessie_password
      - QUARKUS_PROFILE=prod
      - QUARKUS_HTTP_PORT=19120
      - QUARKUS_LOG_CONSOLE_FORMAT=%d{yyyy-MM-dd HH:mm:ss} %-5p [%c{1.}] (%t) %s%e%n
      - QUARKUS_LOG_LEVEL=INFO
      - NESSIE_VERSION_STORE_GARBAGE_COLLECTION_ENABLED=false  # Disable garbage collection temporarily
      - NESSIE_VERSION_STORE_RETENTION_DURATION=P30D  # Keep history for 30 days
    ports:
      - "19120:19120"
    networks:
      - data_lake_network
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:19120/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Dremio
  dremio:
    platform: linux/x86_64
    image: dremio/dremio-oss:latest
    ports:
      - 9047:9047
      - 31010:31010
      - 32010:32010
      - 45678:45678
    container_name: dremio
    environment:
      - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist
      - dremio.iceberg.enabled=true
      - dremio.execution.support_unlimited_splits=true
      # Add these configurations
      - DREMIO_JAVA_EXTRA_OPTS=-Dfs.s3a.path.style.access=true -Dfs.s3a.endpoint=http://minio:9000
      - AWS_ACCESS_KEY_ID=admin
      - AWS_SECRET_ACCESS_KEY=password
      - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Ddremio.nessie.defaultBranch=main -Ddremio.nessie.authentication.enabled=false
      - dremio.nessie.client.readTimeout=30000
      - dremio.nessie.client.writeTimeout=30000
    networks:
      - data_lake_network
    volumes:
      - ./dremio_data:/opt/dremio/data:rw

volumes:
  postgres-data:
    driver: local
  minio_data:
    driver: local
  nessie_data:
    driver: local
  spark_data:
    driver: local
  spark_worker_data:
    driver: local
  dremio_data:
    driver: local
  jupyter_data:
    driver: local
  notebook-seed:
    driver: local

networks:
  data_lake_network:
    driver: bridge
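Two things stand out in the file above. First, the MinIO entrypoint starts the server on /data (minio server /data ...), but the only mount on that service is ./minio_data:/minio-data, which is just the seed directory. The actual object store under /data lives in the container's writable layer, so every docker compose down removes it. Nessie's commit log survives in Postgres, which is why the source list still appears in Dremio, but the Iceberg data and metadata files those commits point to were in MinIO and are gone. A sketch of the likely fix, keeping /data as the storage path (the ./minio-seed host path is illustrative):

  minio:
    volumes:
      - ./minio_data:/data          # persist the object store itself; the server runs on /data
      - ./minio-seed:/minio-data    # seed files copied into the 'seed' bucket at startup

Second, DREMIO_JAVA_SERVER_EXTRA_OPTS is defined twice in the dremio service, and one definition overrides the other, so the -Dpaths.dist setting is likely dropped. Merging the two entries into one should preserve both:

  dremio:
    environment:
      - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist -Ddremio.nessie.defaultBranch=main -Ddremio.nessie.authentication.enabled=false

Finally, the named volumes declared in the top-level volumes block (postgres-data, minio_data, and so on) are never referenced by any service, since every service uses ./ bind mounts, so that block is inert; and the chmod -R 777 postgres-data/ and chmod -R 777 dremio-data/ lines in the MinIO entrypoint refer to paths that do not exist inside that container and can be removed.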