I followed this tutorial: Hands-on with Apache Iceberg on Your Laptop: Deep Dive with Apache Spark, Nessie, Minio, Dremio, Polars and Seaborn
but it doesn't persist data; it only keeps the data in memory.
So how do I store the data on disk?
I mean, after docker compose down and then docker compose up, all the data is erased.
How do I configure this in the docker-compose.yml file?
@ganbaatar What command do you use to bring down Dremio? From the document @AlexMerced wrote, I can see two options.
Spinning Down the Environment
To stop and remove all running services, use the following command:
- docker-compose down
This stops the services and removes the associated containers and networks (volumes are left in place). Your data will still be preserved in the mounted volumes, so any changes made to your data (such as in Minio or Nessie) will remain intact the next time you spin up the environment.
You can also use the following flag with docker-compose down:
--volumes: This flag removes the named volumes declared in the Compose file as well. Use it if you want to completely clean up the environment, including any data persisted in those volumes. (Bind-mounted host directories such as ./postgres-data are not deleted.)
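For example, the difference between the two teardown commands:

  docker-compose down            # containers and networks removed; volumes and bind mounts survive
  docker-compose down --volumes  # also deletes named volumes (bind-mounted host directories still survive)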
You can also consult the Nessie documentation at projectnessie.org to configure a backing store such as Postgres or MongoDB.
These would be passed as environment variables in the Docker Compose file.
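As a rough sketch (the service name, database name, and credentials are placeholders), a Postgres-backed Nessie service would look something like this:

  nessie:
    image: projectnessie/nessie:latest
    depends_on:
      - postgres                        # the Postgres service defined in the same file
    environment:
      - NESSIE_VERSION_STORE_TYPE=JDBC  # switch the version store from in-memory to JDBC
      - QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/nessie
      - QUARKUS_DATASOURCE_USERNAME=nessie
      - QUARKUS_DATASOURCE_PASSWORD=nessie_password
    ports:
      - "19120:19120"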
Yes, I have tried configuring Nessie to store its metadata in Postgres, but after docker compose down and docker compose up, Dremio still shows the list of Nessie tables yet can't display their data. My docker-compose.yml is below; what is the problem with it? Please try my docker-compose.yml file and fix it for me.
I hope your help will also help future users of Dremio. @AlexMerced @balaji.ramaswamy
#version: '3.8'
services:
# Minio Storage Server
minio:
image: minio/minio
container_name: minio
restart: always
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
- MINIO_DOMAIN=minio
- MINIO_REGION_NAME=us-east-1
- MINIO_REGION=us-east-1
ports:
- "9000:9000"
- "9001:9001"
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
volumes:
- ./minio_data:/minio-data
entrypoint: >
/bin/sh -c "
chmod -R 777 postgres-data/;
chmod -R 777 dremio-data/;
minio server /data --console-address ':9001' &
sleep 5;
mc alias set myminio http://localhost:9000 admin password;
mc mb myminio/datalake;
mc mb myminio/datalakehouse;
mc mb myminio/warehouse;
mc mb myminio/seed;
mc cp /minio-data/* myminio/seed/;
tail -f /dev/null"
networks:
- data_lake_network
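    # NOTE: "minio server /data" in the entrypoint above writes objects to /data,
    # but the only mount is ./minio_data:/minio-data (the seed files). Nothing maps
    # /data to the host, so all bucket contents are lost when the container is removed.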
# Spark configuration
spark:
image: alexmerced/spark35nb:latest
restart: always
ports:
- 8080:8080
- 7077:7077
- 8081:8081
- 4040-4045:4040-4045
- 18080:18080
- 8888:8888
environment:
- AWS_REGION=us-east-1
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- SPARK_MASTER_HOST=spark
- SPARK_MASTER_PORT=7077
- SPARK_MASTER_WEBUI_PORT=8080
- SPARK_WORKER_WEBUI_PORT=8081
- SPARK_HISTORY_OPTS=-Dspark.history.fs.logDirectory=/tmp/spark-events
- SPARK_HOME=/opt/spark
volumes:
- ./notebook-seed:/workspace/seed-data
container_name: spark
entrypoint: >
/bin/bash -c "
/opt/spark/sbin/start-master.sh && \
/opt/spark/sbin/start-worker.sh spark://$(hostname):7077 && \
mkdir -p /tmp/spark-events && \
start-history-server.sh && \
jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' && \
tail -f /dev/null
"
networks:
- data_lake_network
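    # NOTE: only /workspace/seed-data is bind-mounted; notebooks saved anywhere else
    # inside the container are lost when the container is removed.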
# PostgreSQL Database for Nessie
postgres:
image: postgres:17
container_name: nessie-postgres
restart: always
environment:
- POSTGRES_USER=nessie
- POSTGRES_PASSWORD=nessie_password
- POSTGRES_DB=nessie
# user: "${UID}:${GID}"
volumes:
- ./postgres-data:/var/lib/postgresql/data:rw # Add :rw to ensure read-write permissions
ports:
- "5432:5432"
networks:
- data_lake_network
healthcheck:
test: ["CMD-SHELL", "pg_isready -U nessie"]
interval: 10s
timeout: 5s
retries: 5
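    # Nessie's commit metadata persists here via the ./postgres-data bind mount, which
    # is why the table list survives a restart even when the files in Minio do not.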
# Nessie Catalog Server
nessie:
image: projectnessie/nessie:latest
container_name: nessie
restart: always
depends_on:
postgres:
condition: service_healthy
environment:
- NESSIE_VERSION_STORE_TYPE=JDBC
- QUARKUS_DATASOURCE_JDBC_URL=jdbc:postgresql://postgres:5432/nessie
- QUARKUS_DATASOURCE_USERNAME=nessie
- QUARKUS_DATASOURCE_PASSWORD=nessie_password
- QUARKUS_PROFILE=prod
- QUARKUS_HTTP_PORT=19120
- QUARKUS_LOG_CONSOLE_FORMAT=%d{yyyy-MM-dd HH:mm:ss} %-5p [%c{1.}] (%t) %s%e%n
- QUARKUS_LOG_LEVEL=INFO
- NESSIE_VERSION_STORE_GARBAGE_COLLECTION_ENABLED=false # Disable garbage collection temporarily
- NESSIE_VERSION_STORE_RETENTION_DURATION=P30D # Keep history for 30 days
ports:
- "19120:19120"
networks:
- data_lake_network
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:19120/api/v1/health"]
interval: 30s
timeout: 10s
retries: 3
# Dremio
dremio:
platform: linux/x86_64
image: dremio/dremio-oss:latest
ports:
- 9047:9047
- 31010:31010
- 32010:32010
- 45678:45678
container_name: dremio
environment:
- DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist
- dremio.iceberg.enabled=true
- dremio.execution.support_unlimited_splits=true
# Add these configurations
- DREMIO_JAVA_EXTRA_OPTS=-Dfs.s3a.path.style.access=true -Dfs.s3a.endpoint=http://minio:9000
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
- DREMIO_JAVA_SERVER_EXTRA_OPTS=-Ddremio.nessie.defaultBranch=main -Ddremio.nessie.authentication.enabled=false
- dremio.nessie.client.readTimeout=30000
- dremio.nessie.client.writeTimeout=30000
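    # NOTE: DREMIO_JAVA_SERVER_EXTRA_OPTS appears twice in this list; Compose keeps
    # only the last value, so -Dpaths.dist=file:///opt/dremio/data/dist is dropped.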
networks:
- data_lake_network
volumes:
- ./dremio_data:/opt/dremio/data:rw
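# NOTE: the named volumes declared below are never referenced by any service above
# (every service uses a ./ bind mount instead), so this block has no effect.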
volumes:
postgres-data:
driver: local
minio_data:
driver: local
nessie_data:
driver: local
spark_data:
driver: local
spark_worker_data:
driver: local
dremio_data:
driver: local
jupyter_data:
driver: local
notebook-seed:
driver: local
networks:
data_lake_network:
driver: bridge
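If the goal is data that survives docker compose down, the most likely culprit in the file above is the Minio service: the entrypoint runs minio server /data, but /data is never mounted to the host, so the bucket contents (the actual Iceberg data and metadata files) vanish with the container, even though Nessie's commit log in Postgres survives. A minimal sketch of a fix (the host path ./minio-storage is a placeholder, not a name from the tutorial):

  minio:
    # ...everything else unchanged...
    volumes:
      - ./minio_data:/minio-data   # seed files, as before
      - ./minio-storage:/data      # NEW: persist the actual bucket data on the host

Two smaller cleanups while you are at it: the two chmod -R 777 lines in the Minio entrypoint reference directories that don't exist inside that container (they fail harmlessly and can be removed), and the two DREMIO_JAVA_SERVER_EXTRA_OPTS entries in the dremio service should be merged into one, since the second currently overrides the first:

  dremio:
    environment:
      - DREMIO_JAVA_SERVER_EXTRA_OPTS=-Dpaths.dist=file:///opt/dremio/data/dist -Ddremio.nessie.defaultBranch=main -Ddremio.nessie.authentication.enabled=false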