fix(docker): Rewrite Dockerfile to fix broken builds

Fixes #1550 - Docker builds were broken after PR #1535 switched from
vendored GPAC to system GPAC.

Changes:
- Switch from Alpine to Debian Bookworm (Alpine's musl libc has issues
  with Rust bindgen's libclang dynamic loading)
- Support three build variants via BUILD_TYPE argument:
  - minimal: No OCR support
  - ocr (default): Tesseract OCR for bitmap subtitles
  - hardsubx: OCR + FFmpeg for burned-in subtitle extraction
- Support dual source modes via USE_LOCAL_SOURCE argument:
  - 0 (default): Clone from GitHub (standalone Dockerfile)
  - 1: Use local source (faster for developers)
- Add .dockerignore to exclude build artifacts (~2.7GB -> ~900KB context)
- Update README.md with comprehensive build instructions

Tested all three variants successfully:
- minimal: ~130MB image
- ocr: ~215MB image
- hardsubx: ~610MB image

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Carlos Fernandez
2025-12-20 17:26:27 +01:00
parent 5f0c6728bf
commit 3f45a4e136
4 changed files with 342 additions and 82 deletions

37
.dockerignore Normal file
View File

@@ -0,0 +1,37 @@
# Build artifacts
linux/ccextractor
linux/rust/
linux/*.o
linux/*.a
mac/ccextractor
mac/rust/
build/
build_*/
# Git
.git/
.github/
# IDE
.vscode/
.idea/
*.swp
*.swo
# Docker
docker/
# Documentation (not needed for build)
docs/
*.md
!README.md
# Test files
*.ts
*.mp4
*.mkv
*.srt
*.vtt
# Plans
plans/

239
docker/Dockerfile Normal file
View File

@@ -0,0 +1,239 @@
# CCExtractor Docker Build
#
# Build variants via BUILD_TYPE argument:
# - minimal: Basic CCExtractor without OCR
# - ocr: CCExtractor with OCR support (default)
# - hardsubx: CCExtractor with burned-in subtitle extraction (requires FFmpeg)
#
# Source options via USE_LOCAL_SOURCE argument:
# - 0 (default): Clone from GitHub (standalone Dockerfile usage)
# - 1: Use local source (when building from cloned repo)
#
# Build examples:
#
# # Standalone (just the Dockerfile, clones from GitHub):
# docker build -t ccextractor docker/
# docker build --build-arg BUILD_TYPE=hardsubx -t ccextractor docker/
#
# # From cloned repository (faster, uses local source):
# docker build --build-arg USE_LOCAL_SOURCE=1 -f docker/Dockerfile -t ccextractor .
# docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=minimal -f docker/Dockerfile -t ccextractor .
ARG DEBIAN_VERSION=bookworm-slim
FROM debian:${DEBIAN_VERSION} AS base
FROM base AS builder
# Build arguments
ARG BUILD_TYPE=ocr
ARG USE_LOCAL_SOURCE=0
# BUILD_TYPE: minimal, ocr, hardsubx
# USE_LOCAL_SOURCE: 0 = git clone, 1 = copy local source
# Avoid interactive prompts during package installation
ENV DEBIAN_FRONTEND=noninteractive
# Install base build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
curl \
ca-certificates \
gcc \
g++ \
cmake \
make \
pkg-config \
bash \
zlib1g-dev \
libpng-dev \
libjpeg-dev \
libssl-dev \
libfreetype-dev \
libxml2-dev \
libcurl4-gnutls-dev \
clang \
libclang-dev \
&& rm -rf /var/lib/apt/lists/*
# Install Rust toolchain
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
ENV PATH="/root/.cargo/bin:${PATH}"
# Install OCR dependencies (for ocr and hardsubx builds)
RUN if [ "$BUILD_TYPE" = "ocr" ] || [ "$BUILD_TYPE" = "hardsubx" ]; then \
apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \
libtesseract-dev \
libleptonica-dev \
&& rm -rf /var/lib/apt/lists/*; \
fi
# Install FFmpeg dependencies (for hardsubx build)
RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
apt-get update && apt-get install -y --no-install-recommends \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
libswresample-dev \
libavfilter-dev \
libavdevice-dev \
&& rm -rf /var/lib/apt/lists/*; \
fi
# Build and install GPAC library
WORKDIR /root
RUN git clone -b v2.4.0 --depth 1 https://github.com/gpac/gpac
WORKDIR /root/gpac
RUN ./configure && make -j$(nproc) lib && make install-lib && ldconfig
WORKDIR /root
RUN rm -rf /root/gpac
# Get CCExtractor source (either clone or copy based on USE_LOCAL_SOURCE)
WORKDIR /root
# First, copy local source if provided (will be empty dir if building standalone)
COPY . /root/ccextractor-local/
# Then get source: use local copy if USE_LOCAL_SOURCE=1 and source exists,
# otherwise clone from GitHub
RUN if [ "$USE_LOCAL_SOURCE" = "1" ] && [ -f /root/ccextractor-local/src/ccextractor.c ]; then \
echo "Using local source"; \
mv /root/ccextractor-local /root/ccextractor; \
else \
echo "Cloning from GitHub"; \
rm -rf /root/ccextractor-local; \
git clone --depth 1 https://github.com/CCExtractor/ccextractor.git /root/ccextractor; \
fi
WORKDIR /root/ccextractor/linux
# Generate build info
RUN ./pre-build.sh
# Build Rust library with appropriate features
RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
cd ../src/rust && \
CARGO_TARGET_DIR=../../linux/rust cargo build --release --features hardsubx_ocr; \
else \
cd ../src/rust && \
CARGO_TARGET_DIR=../../linux/rust cargo build --release; \
fi
RUN cp rust/release/libccx_rust.a ./libccx_rust.a
# Compile CCExtractor
RUN if [ "$BUILD_TYPE" = "minimal" ]; then \
BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
BLD_INCLUDE="-I../src -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
BLD_LINKER="-lm -Wl,--allow-multiple-definition -lpthread -ldl -lgpac ./libccx_rust.a"; \
elif [ "$BUILD_TYPE" = "hardsubx" ]; then \
BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DENABLE_HARDSUBX -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
BLD_INCLUDE="-I../src -I /usr/include/leptonica/ -I /usr/include/tesseract/ -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
BLD_LINKER="-lm -Wl,--allow-multiple-definition -ltesseract -lleptonica -lpthread -ldl -lgpac -lswscale -lavutil -lavformat -lavcodec -lavfilter -lswresample ./libccx_rust.a"; \
else \
BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
BLD_INCLUDE="-I../src -I /usr/include/leptonica/ -I /usr/include/tesseract/ -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
BLD_LINKER="-lm -Wl,--allow-multiple-definition -ltesseract -lleptonica -lpthread -ldl -lgpac ./libccx_rust.a"; \
fi && \
SRC_LIBPNG="$(find ../src/thirdparty/libpng/ -name '*.c')" && \
SRC_ZLIB="$(find ../src/thirdparty/zlib/ -name '*.c')" && \
SRC_CCX="$(find ../src/lib_ccx/ -name '*.c')" && \
SRC_GPAC="$(find /usr/include/gpac/ -name '*.c' 2>/dev/null || true)" && \
SRC_HASH="$(find ../src/thirdparty/lib_hash/ -name '*.c')" && \
SRC_UTF8PROC="../src/thirdparty/utf8proc/utf8proc.c" && \
SRC_FREETYPE="../src/thirdparty/freetype/autofit/autofit.c \
../src/thirdparty/freetype/base/ftbase.c \
../src/thirdparty/freetype/base/ftbbox.c \
../src/thirdparty/freetype/base/ftbdf.c \
../src/thirdparty/freetype/base/ftbitmap.c \
../src/thirdparty/freetype/base/ftcid.c \
../src/thirdparty/freetype/base/ftfntfmt.c \
../src/thirdparty/freetype/base/ftfstype.c \
../src/thirdparty/freetype/base/ftgasp.c \
../src/thirdparty/freetype/base/ftglyph.c \
../src/thirdparty/freetype/base/ftgxval.c \
../src/thirdparty/freetype/base/ftinit.c \
../src/thirdparty/freetype/base/ftlcdfil.c \
../src/thirdparty/freetype/base/ftmm.c \
../src/thirdparty/freetype/base/ftotval.c \
../src/thirdparty/freetype/base/ftpatent.c \
../src/thirdparty/freetype/base/ftpfr.c \
../src/thirdparty/freetype/base/ftstroke.c \
../src/thirdparty/freetype/base/ftsynth.c \
../src/thirdparty/freetype/base/ftsystem.c \
../src/thirdparty/freetype/base/fttype1.c \
../src/thirdparty/freetype/base/ftwinfnt.c \
../src/thirdparty/freetype/bdf/bdf.c \
../src/thirdparty/freetype/bzip2/ftbzip2.c \
../src/thirdparty/freetype/cache/ftcache.c \
../src/thirdparty/freetype/cff/cff.c \
../src/thirdparty/freetype/cid/type1cid.c \
../src/thirdparty/freetype/gzip/ftgzip.c \
../src/thirdparty/freetype/lzw/ftlzw.c \
../src/thirdparty/freetype/pcf/pcf.c \
../src/thirdparty/freetype/pfr/pfr.c \
../src/thirdparty/freetype/psaux/psaux.c \
../src/thirdparty/freetype/pshinter/pshinter.c \
../src/thirdparty/freetype/psnames/psnames.c \
../src/thirdparty/freetype/raster/raster.c \
../src/thirdparty/freetype/sfnt/sfnt.c \
../src/thirdparty/freetype/smooth/smooth.c \
../src/thirdparty/freetype/truetype/truetype.c \
../src/thirdparty/freetype/type1/type1.c \
../src/thirdparty/freetype/type42/type42.c \
../src/thirdparty/freetype/winfonts/winfnt.c" && \
BLD_SOURCES="../src/ccextractor.c $SRC_CCX $SRC_GPAC $SRC_ZLIB $SRC_LIBPNG $SRC_HASH $SRC_UTF8PROC $SRC_FREETYPE" && \
gcc $BLD_FLAGS $BLD_INCLUDE -o ccextractor $BLD_SOURCES $BLD_LINKER
# Copy binary to known location
RUN cp /root/ccextractor/linux/ccextractor /ccextractor
# Final minimal image
FROM base AS final
ARG BUILD_TYPE=ocr
# Avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
# Install runtime dependencies based on build type
RUN apt-get update && apt-get install -y --no-install-recommends \
libpng16-16 \
libjpeg62-turbo \
zlib1g \
libssl3 \
libcurl4 \
&& rm -rf /var/lib/apt/lists/*
# OCR runtime dependencies
RUN if [ "$BUILD_TYPE" = "ocr" ] || [ "$BUILD_TYPE" = "hardsubx" ]; then \
apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr \
liblept5 \
&& rm -rf /var/lib/apt/lists/*; \
fi
# HardSubX runtime dependencies
RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
apt-get update && apt-get install -y --no-install-recommends \
libavcodec59 \
libavformat59 \
libavutil57 \
libswscale6 \
libswresample4 \
libavfilter8 \
libavdevice59 \
&& rm -rf /var/lib/apt/lists/*; \
fi
# Copy GPAC library from builder
COPY --from=builder /usr/local/lib/libgpac.so* /usr/local/lib/
# Update library cache
RUN ldconfig
# Copy CCExtractor binary
COPY --from=builder /ccextractor /ccextractor
ENTRYPOINT ["/ccextractor"]

View File

@@ -1,61 +1,91 @@
# CCExtractor Docker image
# CCExtractor Docker Image
This dockerfile prepares a minimalist Docker image with CCExtractor. It compiles CCExtractor from sources following instructions from the [Compilation Guide](https://github.com/CCExtractor/ccextractor/blob/master/docs/COMPILATION.MD).
This Dockerfile builds CCExtractor with support for multiple build variants.
You can install the latest build of this image by running `docker pull CCExtractor/ccextractor`
## Build Variants
## Build
| Variant | Description | Features |
|---------|-------------|----------|
| `minimal` | Basic CCExtractor | No OCR support |
| `ocr` | With OCR support (default) | Tesseract OCR for bitmap subtitles |
| `hardsubx` | With burned-in subtitle extraction | OCR + FFmpeg for hardcoded subtitles |
You can build the Docker image directly from the Dockerfile provided in [docker](https://github.com/CCExtractor/ccextractor/tree/master/docker) directory of CCExtractor source
## Building
### Standalone Build (from Dockerfile only)
You can build CCExtractor using just the Dockerfile - it will clone the source from GitHub:
```bash
$ git clone https://github.com/CCExtractor/ccextractor.git && cd ccextractor
$ cd docker/
$ docker build -t ccextractor .
# Default build (OCR enabled)
docker build -t ccextractor docker/
# Minimal build (no OCR)
docker build --build-arg BUILD_TYPE=minimal -t ccextractor docker/
# HardSubX build (OCR + FFmpeg for burned-in subtitles)
docker build --build-arg BUILD_TYPE=hardsubx -t ccextractor docker/
```
### Build from Cloned Repository (faster)
If you have already cloned the repository, you can use local source for faster builds:
```bash
git clone https://github.com/CCExtractor/ccextractor.git
cd ccextractor
# Default build (OCR enabled)
docker build --build-arg USE_LOCAL_SOURCE=1 -f docker/Dockerfile -t ccextractor .
# Minimal build
docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=minimal -f docker/Dockerfile -t ccextractor .
# HardSubX build
docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=hardsubx -f docker/Dockerfile -t ccextractor .
```
## Build Arguments
| Argument | Default | Description |
|----------|---------|-------------|
| `BUILD_TYPE` | `ocr` | Build variant: `minimal`, `ocr`, or `hardsubx` |
| `USE_LOCAL_SOURCE` | `0` | Set to `1` to use local source instead of cloning |
| `DEBIAN_VERSION` | `bookworm-slim` | Debian version to use as base |
## Usage
The CCExtractor Docker image can be used in several ways, depending on your needs.
### Basic Usage
```bash
# General usage
$ docker run ccextractor:latest <features>
# Show version
docker run --rm ccextractor --version
# Show help
docker run --rm ccextractor --help
```
1. Process a local file & use `-o` flag
### Processing Local Files
To process a local video file, mount a directory containing the input file inside the container:
Mount your local directory to process files:
```bash
# Use `-o` to specifying output file
$ docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor:latest input.mp4 -o output.srt
# Process a video file with output file
docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor input.mp4 -o output.srt
# Alternatively use `--stdout` feature
$ docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor:latest input.mp4 --stdout > output.srt
# Process using stdout
docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor input.mp4 --stdout > output.srt
```
Run this command from where your input video file is present, and change `input.mp4` & `output.srt` with the actual name of files.
2. Enter an interactive environment
If you need to run CCExtractor with additional options or perform other tasks within the container, you can enter an interactive environment:
bash
### Interactive Mode
```bash
$ docker run --rm -it --entrypoint='sh' ccextractor:latest
docker run --rm -it --entrypoint=/bin/bash ccextractor
```
This will start a Bash shell inside the container, allowing you to run CCExtractor commands manually or perform other operations.
## Image Size
### Example
I run help command in image built from `dockerfile`
```bash
$ docker build -t ccextractor .
$ docker run --rm ccextractor:latest --help
```
This will show the `--help` message of CCExtractor tool
From there you can see all the features and flags which can be used.
The multi-stage build produces runtime images:
- `minimal`: ~130MB
- `ocr`: ~215MB (includes Tesseract)
- `hardsubx`: ~610MB (includes Tesseract + FFmpeg)

View File

@@ -1,46 +0,0 @@
FROM alpine:latest as base
FROM base as builder
RUN apk add --no-cache --update git curl gcc cmake glew glfw \
tesseract-ocr-dev leptonica-dev clang-dev llvm-dev make pkgconfig \
zlib-dev libpng-dev libjpeg-turbo-dev openssl-dev freetype-dev libxml2-dev bash cargo
WORKDIR /root
RUN git clone -b v2.4.0 https://github.com/gpac/gpac
WORKDIR /root/gpac/
RUN ./configure && make -j$(nproc) && make install-lib
WORKDIR /root
RUN rm -rf /root/gpac
RUN git clone https://github.com/CCExtractor/ccextractor.git
WORKDIR /root/ccextractor/linux
RUN ./pre-build.sh && ./build
RUN cp /root/ccextractor/linux/ccextractor /ccextractor && rm -rf ~/ccextractor
FROM base as final
COPY --from=builder /lib/ld-musl-x86_64.so.1 /lib/
COPY --from=builder /usr/lib/libtesseract.so.5 /usr/lib/
COPY --from=builder /usr/lib/libleptonica.so.6 /usr/lib/
COPY --from=builder /usr/local/lib/libgpac.so.12 /usr/local/lib/
COPY --from=builder /usr/lib/libstdc++.so.6 /usr/lib/
COPY --from=builder /usr/lib/libgcc_s.so.1 /usr/lib/
COPY --from=builder /usr/lib/libgomp.so.1 /usr/lib/
COPY --from=builder /usr/lib/libpng16.so.16 /usr/lib/
COPY --from=builder /usr/lib/libjpeg.so.8 /usr/lib/
COPY --from=builder /usr/lib/libgif.so.7 /usr/lib/
COPY --from=builder /usr/lib/libtiff.so.6 /usr/lib/
COPY --from=builder /usr/lib/libwebp.so.7 /usr/lib/
COPY --from=builder /usr/lib/libwebpmux.so.3 /usr/lib/
COPY --from=builder /usr/lib/libz.so.1 /lib/
COPY --from=builder /usr/lib/libssl.so.3 /lib/
COPY --from=builder /usr/lib/libcrypto.so.3 /lib/
COPY --from=builder /usr/lib/liblzma.so.5 /usr/lib/
COPY --from=builder /usr/lib/libzstd.so.1 /usr/lib/
COPY --from=builder /usr/lib/libsharpyuv.so.0 /usr/lib/
COPY --from=builder /ccextractor /
ENTRYPOINT [ "/ccextractor" ]