diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..cff3dcd4
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,37 @@
+# Build artifacts
+linux/ccextractor
+linux/rust/
+linux/*.o
+linux/*.a
+mac/ccextractor
+mac/rust/
+build/
+build_*/
+
+# Git
+.git/
+.github/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Docker
+docker/
+
+# Documentation (not needed for build)
+docs/
+*.md
+!README.md
+
+# Test files
+*.ts
+*.mp4
+*.mkv
+*.srt
+*.vtt
+
+# Plans
+plans/
diff --git a/docker/Dockerfile b/docker/Dockerfile
new file mode 100644
index 00000000..2054968f
--- /dev/null
+++ b/docker/Dockerfile
@@ -0,0 +1,253 @@
+# CCExtractor Docker Build
+#
+# Build variants via BUILD_TYPE argument:
+#   - minimal: Basic CCExtractor without OCR
+#   - ocr: CCExtractor with OCR support (default)
+#   - hardsubx: CCExtractor with burned-in subtitle extraction (requires FFmpeg)
+#
+# Source options via USE_LOCAL_SOURCE argument:
+#   - 0 (default): Clone from GitHub (standalone Dockerfile usage)
+#   - 1: Use local source (when building from cloned repo)
+#
+# Build examples:
+#
+#   # Standalone (just the Dockerfile, clones from GitHub):
+#   docker build -t ccextractor docker/
+#   docker build --build-arg BUILD_TYPE=hardsubx -t ccextractor docker/
+#
+#   # From cloned repository (faster, uses local source):
+#   docker build --build-arg USE_LOCAL_SOURCE=1 -f docker/Dockerfile -t ccextractor .
+#   docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=minimal -f docker/Dockerfile -t ccextractor .
+
+ARG DEBIAN_VERSION=bookworm-slim
+
+FROM debian:${DEBIAN_VERSION} AS base
+
+FROM base AS builder
+
+# Build arguments
+ARG BUILD_TYPE=ocr
+ARG USE_LOCAL_SOURCE=0
+# BUILD_TYPE: minimal, ocr, hardsubx
+# USE_LOCAL_SOURCE: 0 = git clone, 1 = copy local source
+
+# Fail fast on an unsupported BUILD_TYPE instead of failing late in the compile step
+RUN case "$BUILD_TYPE" in \
+        minimal|ocr|hardsubx) ;; \
+        *) echo "Unsupported BUILD_TYPE '$BUILD_TYPE' (expected minimal, ocr or hardsubx)" >&2; exit 1 ;; \
+    esac
+
+# Avoid interactive prompts during package installation (build-time only, hence ARG rather than ENV)
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install base build dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git \
+    curl \
+    ca-certificates \
+    gcc \
+    g++ \
+    cmake \
+    make \
+    pkg-config \
+    bash \
+    zlib1g-dev \
+    libpng-dev \
+    libjpeg-dev \
+    libssl-dev \
+    libfreetype-dev \
+    libxml2-dev \
+    libcurl4-gnutls-dev \
+    clang \
+    libclang-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Rust toolchain
+# The installer is downloaded to a file before it is run: with `curl | sh` under
+# /bin/sh a failed download is masked by the pipe and the step passes without Rust.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh \
+    && sh /tmp/rustup-init.sh -y --default-toolchain stable \
+    && rm -f /tmp/rustup-init.sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install OCR dependencies (for ocr and hardsubx builds)
+RUN if [ "$BUILD_TYPE" = "ocr" ] || [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        apt-get update && apt-get install -y --no-install-recommends \
+        tesseract-ocr \
+        libtesseract-dev \
+        libleptonica-dev \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Install FFmpeg dependencies (for hardsubx build)
+RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        apt-get update && apt-get install -y --no-install-recommends \
+        libavcodec-dev \
+        libavformat-dev \
+        libavutil-dev \
+        libswscale-dev \
+        libswresample-dev \
+        libavfilter-dev \
+        libavdevice-dev \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Build and install GPAC library
+WORKDIR /root
+RUN git clone -b v2.4.0 --depth 1 https://github.com/gpac/gpac
+WORKDIR /root/gpac
+RUN ./configure && make -j$(nproc) lib && make install-lib && ldconfig
+WORKDIR /root
+RUN rm -rf /root/gpac
+
+# Get CCExtractor source (either clone or copy based on USE_LOCAL_SOURCE)
+WORKDIR /root
+# First, copy the build context: the whole repository when built from a clone,
+# or only the docker/ directory (no sources) when building standalone
+COPY . /root/ccextractor-local/
+
+# Then get source: use local copy if USE_LOCAL_SOURCE=1 and source exists,
+# otherwise clone from GitHub
+RUN if [ "$USE_LOCAL_SOURCE" = "1" ] && [ -f /root/ccextractor-local/src/ccextractor.c ]; then \
+        echo "Using local source"; \
+        mv /root/ccextractor-local /root/ccextractor; \
+    else \
+        if [ "$USE_LOCAL_SOURCE" = "1" ]; then \
+            echo "WARNING: USE_LOCAL_SOURCE=1 but src/ccextractor.c was not found in the build context; cloning from GitHub instead" >&2; \
+        fi; \
+        echo "Cloning from GitHub"; \
+        rm -rf /root/ccextractor-local; \
+        git clone --depth 1 https://github.com/CCExtractor/ccextractor.git /root/ccextractor; \
+    fi
+
+WORKDIR /root/ccextractor/linux
+
+# Generate build info
+RUN ./pre-build.sh
+
+# Build Rust library with appropriate features
+RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        cd ../src/rust && \
+        CARGO_TARGET_DIR=../../linux/rust cargo build --release --features hardsubx_ocr; \
+    else \
+        cd ../src/rust && \
+        CARGO_TARGET_DIR=../../linux/rust cargo build --release; \
+    fi
+
+RUN cp rust/release/libccx_rust.a ./libccx_rust.a
+
+# Compile CCExtractor
+RUN if [ "$BUILD_TYPE" = "minimal" ]; then \
+        BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
+        BLD_INCLUDE="-I../src -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
+        BLD_LINKER="-lm -Wl,--allow-multiple-definition -lpthread -ldl -lgpac ./libccx_rust.a"; \
+    elif [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DENABLE_HARDSUBX -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
+        BLD_INCLUDE="-I../src -I /usr/include/leptonica/ -I /usr/include/tesseract/ -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
+        BLD_LINKER="-lm -Wl,--allow-multiple-definition -ltesseract -lleptonica -lpthread -ldl -lgpac -lswscale -lavutil -lavformat -lavcodec -lavfilter -lswresample ./libccx_rust.a"; \
+    else \
+        BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP -DGPAC_64_BITS"; \
+        BLD_INCLUDE="-I../src -I /usr/include/leptonica/ -I /usr/include/tesseract/ -I../src/lib_ccx/ -I /usr/include/gpac/ -I../src/thirdparty/libpng -I../src/thirdparty/zlib -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash -I../src/thirdparty -I../src/thirdparty/freetype/include"; \
+        BLD_LINKER="-lm -Wl,--allow-multiple-definition -ltesseract -lleptonica -lpthread -ldl -lgpac ./libccx_rust.a"; \
+    fi && \
+    SRC_LIBPNG="$(find ../src/thirdparty/libpng/ -name '*.c')" && \
+    SRC_ZLIB="$(find ../src/thirdparty/zlib/ -name '*.c')" && \
+    SRC_CCX="$(find ../src/lib_ccx/ -name '*.c')" && \
+    SRC_GPAC="$(find /usr/include/gpac/ -name '*.c' 2>/dev/null || true)" && \
+    SRC_HASH="$(find ../src/thirdparty/lib_hash/ -name '*.c')" && \
+    SRC_UTF8PROC="../src/thirdparty/utf8proc/utf8proc.c" && \
+    SRC_FREETYPE="../src/thirdparty/freetype/autofit/autofit.c \
+        ../src/thirdparty/freetype/base/ftbase.c \
+        ../src/thirdparty/freetype/base/ftbbox.c \
+        ../src/thirdparty/freetype/base/ftbdf.c \
+        ../src/thirdparty/freetype/base/ftbitmap.c \
+        ../src/thirdparty/freetype/base/ftcid.c \
+        ../src/thirdparty/freetype/base/ftfntfmt.c \
+        ../src/thirdparty/freetype/base/ftfstype.c \
+        ../src/thirdparty/freetype/base/ftgasp.c \
+        ../src/thirdparty/freetype/base/ftglyph.c \
+        ../src/thirdparty/freetype/base/ftgxval.c \
+        ../src/thirdparty/freetype/base/ftinit.c \
+        ../src/thirdparty/freetype/base/ftlcdfil.c \
+        ../src/thirdparty/freetype/base/ftmm.c \
+        ../src/thirdparty/freetype/base/ftotval.c \
+        ../src/thirdparty/freetype/base/ftpatent.c \
+        ../src/thirdparty/freetype/base/ftpfr.c \
+        ../src/thirdparty/freetype/base/ftstroke.c \
+        ../src/thirdparty/freetype/base/ftsynth.c \
+        ../src/thirdparty/freetype/base/ftsystem.c \
+        ../src/thirdparty/freetype/base/fttype1.c \
+        ../src/thirdparty/freetype/base/ftwinfnt.c \
+        ../src/thirdparty/freetype/bdf/bdf.c \
+        ../src/thirdparty/freetype/bzip2/ftbzip2.c \
+        ../src/thirdparty/freetype/cache/ftcache.c \
+        ../src/thirdparty/freetype/cff/cff.c \
+        ../src/thirdparty/freetype/cid/type1cid.c \
+        ../src/thirdparty/freetype/gzip/ftgzip.c \
+        ../src/thirdparty/freetype/lzw/ftlzw.c \
+        ../src/thirdparty/freetype/pcf/pcf.c \
+        ../src/thirdparty/freetype/pfr/pfr.c \
+        ../src/thirdparty/freetype/psaux/psaux.c \
+        ../src/thirdparty/freetype/pshinter/pshinter.c \
+        ../src/thirdparty/freetype/psnames/psnames.c \
+        ../src/thirdparty/freetype/raster/raster.c \
+        ../src/thirdparty/freetype/sfnt/sfnt.c \
+        ../src/thirdparty/freetype/smooth/smooth.c \
+        ../src/thirdparty/freetype/truetype/truetype.c \
+        ../src/thirdparty/freetype/type1/type1.c \
+        ../src/thirdparty/freetype/type42/type42.c \
+        ../src/thirdparty/freetype/winfonts/winfnt.c" && \
+    BLD_SOURCES="../src/ccextractor.c $SRC_CCX $SRC_GPAC $SRC_ZLIB $SRC_LIBPNG $SRC_HASH $SRC_UTF8PROC $SRC_FREETYPE" && \
+    gcc $BLD_FLAGS $BLD_INCLUDE -o ccextractor $BLD_SOURCES $BLD_LINKER
+
+# Copy binary to known location
+RUN cp /root/ccextractor/linux/ccextractor /ccextractor
+
+# Final minimal image
+FROM base AS final
+
+ARG BUILD_TYPE=ocr
+
+# Avoid interactive prompts (build-time only; ARG keeps it out of the runtime environment)
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Install runtime dependencies based on build type
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libpng16-16 \
+    libjpeg62-turbo \
+    zlib1g \
+    libssl3 \
+    libcurl4 \
+    && rm -rf /var/lib/apt/lists/*
+
+# OCR runtime dependencies
+RUN if [ "$BUILD_TYPE" = "ocr" ] || [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        apt-get update && apt-get install -y --no-install-recommends \
+        tesseract-ocr \
+        liblept5 \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# HardSubX runtime dependencies
+RUN if [ "$BUILD_TYPE" = "hardsubx" ]; then \
+        apt-get update && apt-get install -y --no-install-recommends \
+        libavcodec59 \
+        libavformat59 \
+        libavutil57 \
+        libswscale6 \
+        libswresample4 \
+        libavfilter8 \
+        libavdevice59 \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Copy GPAC library from builder
+COPY --from=builder /usr/local/lib/libgpac.so* /usr/local/lib/
+
+# Update library cache
+RUN ldconfig
+
+# Copy CCExtractor binary
+COPY --from=builder /ccextractor /ccextractor
+
+ENTRYPOINT ["/ccextractor"]
diff --git a/docker/README.md b/docker/README.md
index 60fdb744..d423b58c 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,61 +1,91 @@
-# CCExtractor Docker image
+# CCExtractor Docker Image
 
-This dockerfile prepares a minimalist Docker image with CCExtractor. It compiles CCExtractor from sources following instructions from the [Compilation Guide](https://github.com/CCExtractor/ccextractor/blob/master/docs/COMPILATION.MD).
+This Dockerfile builds CCExtractor with support for multiple build variants.
-You can install the latest build of this image by running `docker pull CCExtractor/ccextractor` +## Build Variants -## Build +| Variant | Description | Features | +|---------|-------------|----------| +| `minimal` | Basic CCExtractor | No OCR support | +| `ocr` | With OCR support (default) | Tesseract OCR for bitmap subtitles | +| `hardsubx` | With burned-in subtitle extraction | OCR + FFmpeg for hardcoded subtitles | -You can build the Docker image directly from the Dockerfile provided in [docker](https://github.com/CCExtractor/ccextractor/tree/master/docker) directory of CCExtractor source +## Building + +### Standalone Build (from Dockerfile only) + +You can build CCExtractor using just the Dockerfile - it will clone the source from GitHub: ```bash -$ git clone https://github.com/CCExtractor/ccextractor.git && cd ccextractor -$ cd docker/ -$ docker build -t ccextractor . +# Default build (OCR enabled) +docker build -t ccextractor docker/ + +# Minimal build (no OCR) +docker build --build-arg BUILD_TYPE=minimal -t ccextractor docker/ + +# HardSubX build (OCR + FFmpeg for burned-in subtitles) +docker build --build-arg BUILD_TYPE=hardsubx -t ccextractor docker/ ``` +### Build from Cloned Repository (faster) + +If you have already cloned the repository, you can use local source for faster builds: + +```bash +git clone https://github.com/CCExtractor/ccextractor.git +cd ccextractor + +# Default build (OCR enabled) +docker build --build-arg USE_LOCAL_SOURCE=1 -f docker/Dockerfile -t ccextractor . + +# Minimal build +docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=minimal -f docker/Dockerfile -t ccextractor . + +# HardSubX build +docker build --build-arg USE_LOCAL_SOURCE=1 --build-arg BUILD_TYPE=hardsubx -f docker/Dockerfile -t ccextractor . 
+``` + +## Build Arguments + +| Argument | Default | Description | +|----------|---------|-------------| +| `BUILD_TYPE` | `ocr` | Build variant: `minimal`, `ocr`, or `hardsubx` | +| `USE_LOCAL_SOURCE` | `0` | Set to `1` to use local source instead of cloning | +| `DEBIAN_VERSION` | `bookworm-slim` | Debian version to use as base | + ## Usage -The CCExtractor Docker image can be used in several ways, depending on your needs. +### Basic Usage ```bash -# General usage -$ docker run ccextractor:latest +# Show version +docker run --rm ccextractor --version + +# Show help +docker run --rm ccextractor --help ``` -1. Process a local file & use `-o` flag +### Processing Local Files -To process a local video file, mount a directory containing the input file inside the container: +Mount your local directory to process files: ```bash -# Use `-o` to specifying output file -$ docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor:latest input.mp4 -o output.srt +# Process a video file with output file +docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor input.mp4 -o output.srt -# Alternatively use `--stdout` feature -$ docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor:latest input.mp4 --stdout > output.srt +# Process using stdout +docker run --rm -v $(pwd):$(pwd) -w $(pwd) ccextractor input.mp4 --stdout > output.srt ``` -Run this command from where your input video file is present, and change `input.mp4` & `output.srt` with the actual name of files. - -2. Enter an interactive environment - -If you need to run CCExtractor with additional options or perform other tasks within the container, you can enter an interactive environment: -bash +### Interactive Mode ```bash -$ docker run --rm -it --entrypoint='sh' ccextractor:latest +docker run --rm -it --entrypoint=/bin/bash ccextractor ``` -This will start a Bash shell inside the container, allowing you to run CCExtractor commands manually or perform other operations. 
+## Image Size -### Example - -I run help command in image built from `dockerfile` - -```bash -$ docker build -t ccextractor . -$ docker run --rm ccextractor:latest --help -``` - -This will show the `--help` message of CCExtractor tool -From there you can see all the features and flags which can be used. +The multi-stage build produces runtime images: +- `minimal`: ~130MB +- `ocr`: ~215MB (includes Tesseract) +- `hardsubx`: ~610MB (includes Tesseract + FFmpeg) diff --git a/docker/dockerfile b/docker/dockerfile deleted file mode 100644 index 908305eb..00000000 --- a/docker/dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM alpine:latest as base - -FROM base as builder - -RUN apk add --no-cache --update git curl gcc cmake glew glfw \ - tesseract-ocr-dev leptonica-dev clang-dev llvm-dev make pkgconfig \ - zlib-dev libpng-dev libjpeg-turbo-dev openssl-dev freetype-dev libxml2-dev bash cargo - -WORKDIR /root -RUN git clone -b v2.4.0 https://github.com/gpac/gpac -WORKDIR /root/gpac/ -RUN ./configure && make -j$(nproc) && make install-lib -WORKDIR /root -RUN rm -rf /root/gpac - -RUN git clone https://github.com/CCExtractor/ccextractor.git -WORKDIR /root/ccextractor/linux -RUN ./pre-build.sh && ./build - -RUN cp /root/ccextractor/linux/ccextractor /ccextractor && rm -rf ~/ccextractor - -FROM base as final - -COPY --from=builder /lib/ld-musl-x86_64.so.1 /lib/ -COPY --from=builder /usr/lib/libtesseract.so.5 /usr/lib/ -COPY --from=builder /usr/lib/libleptonica.so.6 /usr/lib/ -COPY --from=builder /usr/local/lib/libgpac.so.12 /usr/local/lib/ -COPY --from=builder /usr/lib/libstdc++.so.6 /usr/lib/ -COPY --from=builder /usr/lib/libgcc_s.so.1 /usr/lib/ -COPY --from=builder /usr/lib/libgomp.so.1 /usr/lib/ -COPY --from=builder /usr/lib/libpng16.so.16 /usr/lib/ -COPY --from=builder /usr/lib/libjpeg.so.8 /usr/lib/ -COPY --from=builder /usr/lib/libgif.so.7 /usr/lib/ -COPY --from=builder /usr/lib/libtiff.so.6 /usr/lib/ -COPY --from=builder /usr/lib/libwebp.so.7 /usr/lib/ -COPY 
--from=builder /usr/lib/libwebpmux.so.3 /usr/lib/ -COPY --from=builder /usr/lib/libz.so.1 /lib/ -COPY --from=builder /usr/lib/libssl.so.3 /lib/ -COPY --from=builder /usr/lib/libcrypto.so.3 /lib/ -COPY --from=builder /usr/lib/liblzma.so.5 /usr/lib/ -COPY --from=builder /usr/lib/libzstd.so.1 /usr/lib/ -COPY --from=builder /usr/lib/libsharpyuv.so.0 /usr/lib/ - -COPY --from=builder /ccextractor / - -ENTRYPOINT [ "/ccextractor" ]