Compare commits

..

358 Commits

Author SHA1 Message Date
Carlos Fernandez Sanz
270c89b7f8 [FEATURE]: Add Snap packaging support with Github workflow 2026-01-31 17:52:06 -08:00
Carlos Fernandez Sanz
032cd1c6b1 Merge pull request #2040 from THE-Amrit-mahto-05/fix/avc-sei-payload-size
Fix SEI payload type handling: change payload_type and payload_size from i32 to u32 for type safety, keeping `as usize` casts only where needed for indexing.
2026-01-31 17:35:40 -08:00
Carlos Fernandez Sanz
42e4e9a657 Merge pull request #2049 from THE-Amrit-mahto-05/fix-null-len-guard
Adds defensive null pointer and negative length checks to ccxr_verify_crc32 FFI function to prevent undefined behavior.
2026-01-31 17:18:31 -08:00
Carlos Fernandez Sanz
821e307333 Merge pull request #2076 from THE-Amrit-mahto-05/fix-miri-null-deref
Verified with Miri - fixes undefined behavior when calling dealloc() on null pointer in window row deallocation.
2026-01-31 13:58:48 -08:00
Amrit kumar Mahto
ae81f3ba3d Fix Miri-reported UB in window row deallocation and tests 2026-01-31 00:49:50 +05:30
Carlos Fernandez Sanz
b190751b2c [FIX] macOS: Fix hardsub pipeline failing due to arm64/x86_64 build mismatch 2026-01-28 18:30:38 -08:00
GAURAV KARMAKAR
f1bb0f4dce macOS: Fix hardsub pipeline failing due to arm64/x86_64 build mismatch 2026-01-29 00:12:09 +05:30
Amrit kumar Mahto
f147ac27f8 re running for CI to pass checks 2026-01-27 21:03:19 +05:30
Amrit kumar Mahto
2dfb44d7d4 re running CI 2026-01-27 20:42:53 +05:30
Carlos Fernandez Sanz
580e721dfe fix: prevent heap overflow in parse_PAT/parse_PMT and null deref in processmp4 2026-01-23 23:06:35 -08:00
Carlos Fernandez
d0a82447ff fix(rust): resolve clippy unnecessary_unwrap warnings for Rust 1.93
Use if-let patterns instead of is_some() + unwrap() to satisfy
the stricter clippy::unnecessary_unwrap lint in Rust 1.93.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 20:58:03 -08:00
Carlos Fernandez
5c19c7b932 style: fix Rust formatting in parser.rs test
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 20:14:26 -08:00
Carlos Fernandez
fd7271bae2 fix: prevent heap overflow in parse_PAT/parse_PMT and null deref in processmp4
- parse_PAT: Add bounds check for payload_length >= 8 before accessing
  header fields (fixes #2053)
- parse_PMT: Add ES_info_length validation and 2-byte minimum check
  before reading descriptor_tag and desc_len in PRIVATE_USER_MPEG2
  and teletext parsing loops (fixes #2054)
- processmp4: Add NULL check for file parameter before passing to
  mprint (fixes #2055)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-23 20:12:09 -08:00
Chandragupt Singh
05c68349d5 Merge branch 'master' into feat/snap-distribution-support 2026-01-23 15:26:59 +05:30
Chandragupt Singh
09f21f64e4 fix(snap): resolve GPAC dependency and runtime issues in core22 snap 2026-01-23 15:23:33 +05:30
Carlos Fernandez Sanz
c65fb0874e fix(rust): correct mkvlang test to use MkvLangFilter type 2026-01-19 07:43:15 -08:00
Carlos Fernandez
9db727d593 fix(rust): correct mkvlang test to use MkvLangFilter type
The test_mkvlang_sets_mkv_language test was comparing against
Language::Eng, but the mkvlang field type was changed to MkvLangFilter
when BCP 47 language tag support was added in PR #2038.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 07:41:36 -08:00
Amrit kumar Mahto
fe6dad83b7 use u32 for SEI payload type and size 2026-01-19 14:16:50 +05:30
Carlos Fernandez Sanz
d494286082 ci: add workflow to build .deb packages 2026-01-18 20:37:22 -08:00
Carlos Fernandez
259e881483 fix(ci): add missing FFmpeg dependencies to hardsubx .deb packages
Add libavdevice, libswresample, and libavfilter dependencies for
the hardsubx variant on both Ubuntu 24.04 and Debian 13 workflows.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 20:11:10 -08:00
Carlos Fernandez
197069d3b8 ci: add Debian 13 (Trixie) .deb build workflow
Creates .deb packages for Debian 13 using a Docker container.
- Builds GPAC from source (abi-16.4 tag)
- Creates basic and hardsubx variants
- Uses Debian 13's library versions:
  - libtesseract5, libleptonica6
  - libavcodec61, libavformat61, libavutil59, libswscale8

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 20:02:16 -08:00
Carlos Fernandez
7a810d736d fix(ci): add libcurl3t64-gnutls dependency to .deb package
CCExtractor is linked against libcurl-gnutls which requires this
runtime dependency on Ubuntu 24.04.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 19:55:09 -08:00
Carlos Fernandez
1413c948c4 fix(ci): correct leptonica package name for Ubuntu 24.04
Ubuntu 24.04 uses liblept5, not libleptonica6 (which is Ubuntu 25.04).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 19:39:42 -08:00
Carlos Fernandez
bb5385913b fix(ci): use apt install to handle .deb dependencies in test step
apt install automatically resolves and installs dependencies,
unlike dpkg -i which fails if dependencies are missing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 19:13:41 -08:00
Carlos Fernandez Sanz
f8981e8e1e refactor(rust): Rename parser tests with descriptive names and expand coverage 2026-01-18 19:12:34 -08:00
Carlos Fernandez
a1871abf04 fix(ci): switch .deb build to Ubuntu 24.04
- Use ubuntu-24.04 runner instead of ubuntu-22.04
- Update dependencies to match Ubuntu 24.04 library versions
  (libtesseract5, libleptonica6, libavcodec60, etc.)
- Update GPAC cache key for new Ubuntu version

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 19:08:09 -08:00
Carlos Fernandez
20b3773bb9 fix(ci): correct version and add missing dependencies in .deb workflow
- Update CMakeLists.txt version from 0.89 to 0.96 to match lib_ccx.h
- Extract version from lib_ccx.h instead of CMakeLists.txt for accuracy
- Add missing runtime dependencies: libtesseract, libleptonica
- Add FFmpeg dependencies for hardsubx variant

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 19:02:48 -08:00
Carlos Fernandez
8786b4cf75 fix(ci): correct LICENSE filename to LICENSE.txt 2026-01-18 18:08:04 -08:00
Carlos Fernandez
8632ecda5b ci: add workflow to build .deb packages
Add GitHub Actions workflow to build Debian packages (.deb) for Linux.

Features:
- Builds GPAC from source (abi-16.4 tag) since libgpac-dev is not
  available in newer Debian/Ubuntu releases
- Creates two variants: basic (with OCR) and hardsubx (with FFmpeg)
- Bundles GPAC library with the package using patchelf for rpath
- Includes proper Debian package structure with control, postinst, postrm
- Runs on releases, manual trigger, or workflow file changes
- Uploads packages as artifacts and attaches to releases

This provides an unofficial .deb package for users who prefer that
format over AppImage or snap.

Relates to #1610

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 18:00:45 -08:00
Carlos Fernandez Sanz
475153a9dd fix(build): resolve Rust-to-C linking issues on Linux 2026-01-18 17:39:27 -08:00
Carlos Fernandez
df90009f73 ci: add CMakeLists.txt to workflow path filters
Build workflows were not triggering on CMakeLists.txt changes.
Added **CMakeLists.txt and **.cmake patterns to path filters for:
- build_linux.yml
- build_mac.yml
- build_windows.yml
- build_docker.yml

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 17:23:34 -08:00
Carlos Fernandez
2352ea21e3 fix(build): resolve Rust-to-C linking issues on Linux
Two fixes for static library linking:

1. Preserve CMAKE_C_FLAGS in lib_ccx/CMakeLists.txt instead of
   overwriting them. This allows passing include paths via
   -DCMAKE_C_FLAGS which is needed for some build configurations.

2. Add target_link_options with --undefined flags for C functions
   called from Rust (decode_vbi, do_cb, store_hdcc). With static
   libraries, the linker processes them in order and only pulls
   symbols that are currently unresolved. Since ccx is processed
   before ccx_rust, these symbols weren't being pulled in.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 17:11:44 -08:00
Carlos Fernandez Sanz
dc041a35e8 fix(rust): Support BCP 47 language tags in --mkvlang option 2026-01-18 16:33:39 -08:00
Carlos Fernandez Sanz
e99ba1d177 fix(rust): Remove dead code returning pointer to stack variable 2026-01-18 14:11:39 -08:00
Carlos Fernandez
298665faa4 chore: fix cargo fmt formatting
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 13:57:53 -08:00
Carlos Fernandez
735a01bf04 refactor(rust): rename parser tests with descriptive names and expand coverage
Replace poorly-named tests (options_1 through options_51, broken_1, etc.)
with 201 descriptively-named tests organized by category:

- Input/output format tests
- Encoding tests
- Stream/program selection tests
- CEA-708 service tests
- Codec selection tests
- Timing option tests
- Debug flag tests
- Teletext option tests
- XMLTV option tests
- Credits option tests
- Buffering option tests
- And more

Each test name now clearly indicates what CLI option is being tested
and what behavior is expected, e.g.:
- test_input_ts_sets_transport_stream_mode
- test_608_enables_decoder_608_debug
- test_service_enables_708_with_single_service

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 13:55:56 -08:00
Amrit kumar Mahto
3618c23b5a rust/avc: fix SEI payload size handling and type correctness 2026-01-19 03:09:07 +05:30
Carlos Fernandez Sanz
b7c9da75dd Revert "Automatic extraction of multiple DVB subtitle streams (--split-dvb-subs) fixes #447 #1864"
Was incorrectly merged
2026-01-18 13:37:53 -08:00
Carlos Fernandez Sanz
449d55d5e5 Revert "Automatic extraction of multiple DVB subtitle streams (--split-dvb-subs) fixes #447 #1864" 2026-01-18 13:37:26 -08:00
Carlos Fernandez Sanz
60aa370899 fix(rust): Correct version number in CLI parser 2026-01-18 13:35:25 -08:00
Carlos Fernandez
3d18b38c32 Revert "Merge pull request #1912 from Rahul-2k4/final"
This reverts commit 2a6d27f9ff, reversing
changes made to 74e64c0421.
2026-01-18 13:28:15 -08:00
Carlos Fernandez Sanz
2a6d27f9ff Merge pull request #1912 from Rahul-2k4/final
Automatic extraction of multiple DVB subtitle streams (--split-dvb-subs) fixes #447 #1864
2026-01-18 13:27:17 -08:00
Carlos Fernandez
91d3512bcc fix(rust): Support BCP 47 language tags in --mkvlang option
The --mkvlang option previously only supported single ISO 639-2 codes
due to using a Language enum with a fixed list of variants. Extended
codes (like "fre-ca") and multiple codes (like "eng,chi") would panic.

This change introduces MkvLangFilter, a proper type for language
filtering that:

- Validates language codes per BCP 47 specification
- Supports ISO 639-2 (3-letter codes like "eng")
- Supports BCP 47 tags (like "en-US", "zh-Hans-CN")
- Supports comma-separated multiple codes
- Provides clean error messages for invalid input
- Includes comprehensive unit tests

The C code continues to receive the raw string for strstr() matching,
maintaining backward compatibility.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 13:23:39 -08:00
Carlos Fernandez Sanz
74e64c0421 Merge pull request #2035 from THE-Amrit-mahto-05/fix/mkvlang-params-check
fix mkvlang_params_check: prevent panic on multi-byte characters
2026-01-18 13:07:44 -08:00
Carlos Fernandez
c175750ebe fix(rust): Correct version number in CLI parser
The Rust CLI parser was showing "CCExtractor 1.0" instead of the
actual version (0.96.5). This was a placeholder value from when
the parser was first ported to Rust in August 2024 that was never
updated.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-18 12:55:21 -08:00
Carlos Fernandez Sanz
e7dc4d19f7 Merge pull request #2036 from THE-Amrit-mahto-05/fix/process-word-file-safely
fix: process_word_file propagates errors instead of panicking
2026-01-18 12:51:33 -08:00
Carlos Fernandez Sanz
1fbb51056d Merge pull request #1992 from THE-Amrit-mahto-05/fix/teletext-panic
fix: Teletext decoder panic on malformed BCD data
2026-01-18 12:46:56 -08:00
Carlos Fernandez Sanz
5d9a8cc6f2 Merge pull request #2031 from THE-Amrit-mahto-05/fix/rust-userdata-uaf
Fix use-after-free bugs in Rust userdata handling
2026-01-18 12:24:10 -08:00
Amrit kumar Mahto
17abad79f2 fix: process_word_file propagates errors instead of panicking 2026-01-19 01:53:19 +05:30
Amrit kumar Mahto
707e1f01fe updating 2026-01-19 01:34:41 +05:30
Amrit kumar Mahto
efc8b791e7 fix mkvlang_params_check: prevent panic on multi-byte characters 2026-01-19 01:28:25 +05:30
Carlos Fernandez Sanz
a856bbde10 Merge pull request #2015 from Harsh-Sahu43/tests/validate-cc-pair
[FIX] rust: add defensive length check to validate_cc_pair
2026-01-18 11:52:49 -08:00
Carlos Fernandez Sanz
9390b876fa Merge pull request #2034 from THE-Amrit-mahto-05/fix/parser-atol-bug
Fix atol Parsing Bug in parser.rs for Numeric Values and Suffixes
2026-01-18 11:38:53 -08:00
Amrit kumar Mahto
ead0a4beed little fix 2026-01-19 00:45:30 +05:30
Amrit kumar Mahto
b2e9cb74c1 Fix atol parsing bug for numeric values and K/M/G suffixes 2026-01-19 00:31:25 +05:30
Amrit kumar Mahto
20b194aac4 Consolidate Rust userdata fixes: UAF, bounds checks, and VBI safety 2026-01-18 23:34:43 +05:30
Harsh Sahu
2d9b480972 Merge branch 'CCExtractor:master' into tests/validate-cc-pair 2026-01-18 14:48:46 +05:30
Harsh Sahu
1447b021cb Fixed: formatting 2026-01-18 13:58:31 +05:30
Amrit kumar Mahto
e0ac126cff Fix use-after-free bugs in Rust userdata handling 2026-01-18 05:37:44 +05:30
Carlos Fernandez Sanz
b8019bdb35 [FIX] Resolve output artifact on Linux/WSL (line clearing) 2026-01-17 06:02:59 -08:00
Carlos Fernandez Sanz
9d921dec43 fix(matroska): prevent out-of-bounds NAL parsing in AVC/HEVC blocks 2026-01-17 06:00:12 -08:00
Carlos Fernandez Sanz
3ada2b5002 fix(avc): prevent segfault in report-only mode (-out=report) 2026-01-17 05:58:03 -08:00
Rahul Tripathi
50ec9866db style: Fix clang-format ternary operator alignment 2026-01-17 14:12:59 +05:30
Rahul Tripathi
ce87d01fbd fix: Cap DVB subtitle duration to 10s to prevent 65s page timeout bug
Root cause: When FTS timestamps were invalid due to PTS discontinuities,
the code fell back to DVB page timeout (65 seconds) as subtitle duration.
This caused impossible 65-second subtitle durations in split output.

Fix: Added DVB_MAX_SUBTITLE_DURATION_MS constant (10s) and simplified the
duration capping logic to always enforce reasonable subtitle durations.

Tested with: multiprogram_spain.ts, BBC1.ts, BBC2.ts - all outputs now
have properly capped durations with no timestamps exceeding 10 seconds.
2026-01-17 12:14:12 +05:30
Carlos Fernandez
fecd24d08e fix(avc): prevent segfault in report-only mode (-out=report)
When using -out=report mode, the encoder context (enc_ctx) is NULL
because no output file needs to be created. The Rust FFI function
ccxr_process_avc was dereferencing this NULL pointer, causing a
segmentation fault.

Add NULL pointer checks at the FFI boundary to skip AVC processing
when enc_ctx is NULL. This is safe because report mode only needs
stream analysis, not caption extraction.

Fixes #2023

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 20:50:42 -08:00
Rahul Tripathi
482544c5bf docs: Add DVB deduplication feature and double-free fix to CHANGES.TXT 2026-01-16 16:41:46 +05:30
Rahul Tripathi
84a7a1fb41 style: Fix remaining clang-format indentation issues 2026-01-16 16:34:26 +05:30
Rahul Tripathi
f198bcd2ec style: Fix clang-format issues across modified files 2026-01-16 16:31:09 +05:30
Rahul Tripathi
4b6016ca1c style: Fix clang-format issues in dvb_dedup files 2026-01-16 16:26:28 +05:30
Rahul Tripathi
9c2ea47eda fix: Add dvb_dedup.c to Windows and Mac build systems 2026-01-16 16:24:52 +05:30
Rahul Tripathi
170b466a20 fix: Add dvb_dedup.c to autoconf build for GitHub Actions Linux CI 2026-01-16 16:23:43 +05:30
Rahul Tripathi
2bdcd20115 cleanup: Remove temporary debug, test, and tool artifacts from final branch
Remove 186 unwanted files including:
- Debug logs and diagnostic output (debug_*.log, debug_output/, diagnosis_output/)
- Test artifacts and binaries (linux/alltests_*, test_output/, test_split_verification/)
- Tool state files (.agent/, .claude/, .ralph/, .mcp.json, etc.)
- Root-level scripts and temporary Python utilities
- Working notes and temporary documentation (DVB_SPLIT_*.md, progress.json, etc.)
- Unfinished MCP server (tools/mcp-ccextractor/)
- Project-specific working notes (CLAUDE.md)

Update .gitignore to prevent re-adding unwanted artifacts.

Result: final branch now contains only DVB-split feature implementation
and core project files, matching upstream structure while preserving
all functional changes.
2026-01-16 16:18:02 +05:30
Rahul Tripathi
ab18d234d2 Merge branch 'CCExtractor:master' into final 2026-01-16 16:05:36 +05:30
Rahul Tripathi
3ff02617b0 fix: Resolve double-free crash in DVB split pipeline cleanup
- Remove redundant free() after free_subtitle() in pipeline cleanup
  (free_subtitle already frees the struct via freep(&sub))
- Add ctx->prev = NULL after free_encoder_context in dinit_encoder
- Keep free_encoder_context non-recursive for prev (dinit_encoder owns it)
- Remove debug output from general_loop.c
2026-01-16 16:02:59 +05:30
Rahul Tripathi
c7fad95e24 test: Fix DVB dedup test suite - DVB-005 and DVB-007 corrections
- DVB-005: Changed from Teletext-only file to proper DVB extraction using --program-number 530
- DVB-007: Fixed shell script globbing error and variable parsing for dedup effectiveness check
- All test cases now pass: DVB-004 (multilingual split), DVB-005 (single program), DVB-006 (non-DVB), DVB-007 (dedup check), DVB-008 (no-dedup flag)
- Verified: No 0-byte files, deduplication removes 19-29 duplicate lines per stream
2026-01-16 15:05:35 +05:30
Rahul Tripathi
c018f1f43c docs: Mark DVB-004 through DVB-008 as complete
- All deduplication infrastructure implemented and tested
- Test script validates code paths execute correctly
- Dedup ring buffer integrated into all DVB subtitle processing
- Full validation requires OCR build (-DWITH_OCR=ON)
- Code review confirms all 8 stories are complete
2026-01-16 14:15:44 +05:30
Rahul Tripathi
98b50b2a35 test: Add DVB dedup test suite script
- Created dvb_dedup_test.sh to test DVB-001 through DVB-008
- Tests multilingual split, single stream, non-DVB files
- Tests --no-dvb-dedup flag functionality
- Checks for excessive duplication in output
- Note: Requires OCR (Tesseract) for full validation
- Without OCR, files are empty but dedup logic still executes
2026-01-16 14:15:03 +05:30
Rahul Tripathi
46cee0893a feat: DVB-003 - Add --no-dvb-dedup CLI flag
- Added no_dvb_dedup field to ccx_s_options structure
- Initialized to 0 (deduplication enabled by default)
- Added --no-dvb-dedup CLI flag in Rust args parser
- Added flag to Options struct in lib_ccxr
- Wired flag through Rust-to-C FFI boundary in common.rs
- Modified dvbsub_handle_display_segment to respect flag
- Dedup logic only runs when no_dvb_dedup is false (default)
- Added help text describing flag purpose
2026-01-16 14:11:13 +05:30
Rahul Tripathi
42ad48ca7f feat: DVB-001 - Add per-stream dedup ring buffer
- Created dvb_dedup.h with dedup_entry and dedup_ring structures
- Implemented dvb_dedup.c with init, is_duplicate, and add functions
- Integrated dedup_ring into DVBSubContext structure
- Added deduplication check in dvbsub_handle_display_segment
- Dedup uses PTS + PID + composition_id + ancillary_id as unique key
- 8-slot ring buffer to track recently emitted subtitles
- Prevents duplicate subtitles from propagating to output files
2026-01-16 14:04:00 +05:30
Akhilesh
ed26a595bd style(matroska): apply clang-format 2026-01-14 13:42:22 +05:30
Akhilesh
b1c2aabb22 fix(matroska): prevent out-of-bounds NAL parsing in AVC/HEVC blocks 2026-01-14 13:20:23 +05:30
Rahul Tripathi
bb2ae1e70f Fix DVB subtitle repetition bug and memory safety issues 2026-01-13 20:29:44 +05:30
Rahul Tripathi
6464fa486e Fix DVB Split: Remove forced dirty flag, rely on natural dirty + clear 2026-01-13 18:16:41 +05:30
Rahul Tripathi
5aa747ab33 Fix DVB Split bugs: Prevent subtitle repetition and buffer overflow crash 2026-01-13 17:53:30 +05:30
Rahul Tripathi
39adfa59b0 Fix Bug 1: Clear OCR text leakage preventing subtitle repetition
- Clear enc_ctx->prev->last_str after encode_sub() in dvb_subtitle_decoder.c
- This prevents OCR-recognized text from leaking into subsequent subtitles
- Tested: All subtitle output shows unique text with zero duplicates
2026-01-12 11:00:27 +05:30
Carlos Fernandez Sanz
20287548cb fix: Correct progress time display for multi-program TS files 2026-01-11 20:56:59 +01:00
collectnis
b7b10419ec style: fix formatting alignment 2026-01-11 13:46:00 +00:00
collectnis
8fbfd68426 style: fix formatting alignment 2026-01-11 13:31:55 +00:00
collectnis
7159d0b6d0 fix: resolve merge conflict in changelog 2026-01-11 11:48:58 +00:00
collectnis
c515578e37 docs: update changelog 2026-01-11 11:30:54 +00:00
collectnis
e55b8eb764 [CLI] Fix output artifacts on Linux/WSL by clearing line on \r 2026-01-11 10:34:16 +00:00
Carlos Fernandez Sanz
0228fbcbfa fix: Skip moov box if buffer too small to verify mvhd 2026-01-11 10:30:32 +01:00
Carlos Fernandez Sanz
0e190e0962 docs: Add changelog for 0.96.6 2026-01-11 10:29:57 +01:00
Carlos Fernandez
13f1b5ab53 docs: Add changelog for 0.96.6
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 10:28:56 +01:00
Carlos Fernandez Sanz
b39f923c46 docs: Clarify PS probe limit calculation (explain magic number) 2026-01-11 08:55:17 +01:00
Harsh Sahu
7e32d6a553 Merge branch 'CCExtractor:master' into tests/validate-cc-pair 2026-01-11 04:51:33 +05:30
Carlos Fernandez
3bde3dceec fix: Skip moov box if buffer too small to verify mvhd
The previous fix (#1996) prevented a panic when the buffer was too small
to verify if a "moov" box contains "mvhd", but it incorrectly accepted
the box without verification.

The original intent was: "moov without mvhd is invalid, skip it."

This fix maintains that intent:
- If buffer too small to verify mvhd → skip the box
- If moov has mvhd → accept (valid)
- If moov lacks mvhd → skip (invalid)

This is safe for format detection since:
1. The probe reads up to 1MB of start bytes
2. The scoring system requires multiple valid boxes
3. Skipping an unverifiable box is safer than accepting it

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 00:13:11 +01:00
Carlos Fernandez
d5201b1129 docs: Clarify PS probe limit calculation with inline comment
Replace magic number 49997 with `50000 - 3` and add a comment explaining:
- Why we subtract 3 (the loop accesses i+3, so we stop 3 bytes early)
- Why we cap at 50000 (don't scan huge buffers entirely)
- Why we use saturating_sub (handle tiny buffers safely)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-11 00:07:35 +01:00
Carlos Fernandez Sanz
a199f4f8af Merge pull request #1996 from THE-Amrit-mahto-05/fix/demuxer-panics
fix: prevent MP4 & PS demuxer panics due to out-of-bounds/underflow
2026-01-11 00:06:35 +01:00
Harsh Sahu
eea049923d add defensive length check to validate_cc_pair 2026-01-11 04:21:00 +05:30
Carlos Fernandez Sanz
d999c3e0e0 Merge pull request #1985 from x15sr71/docs/homebrew-install
docs: Add Homebrew installation instructions to COMPILATION.MD
2026-01-10 23:43:42 +01:00
Carlos Fernandez
aac90d5a5f fix(rust): Remove dead code returning pointer to stack variable
Delete the unused `impl FromCType<*mut PMT_entry> for *mut PMTEntry`
implementation which had a critical bug: it returned a pointer to a
stack-allocated PMTEntry, causing undefined behavior (dangling pointer).

This code was never called anywhere in the codebase. The actual usage
in demuxer.rs uses the value-returning variant `FromCType<PMT_entry>
for PMTEntry` with explicit `Box::into_raw(Box::new(...))` wrapping,
which is the correct pattern.

Rather than fixing dead buggy code, just remove it.

Supersedes #1988

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 23:41:32 +01:00
Carlos Fernandez Sanz
618df184c6 Merge pull request #2011 from THE-Amrit-mahto-05/fix/demuxer-allocator-mismatch
Fix allocator mismatch in Rust demuxer (use malloc/free instead of Box)
2026-01-10 23:21:16 +01:00
Chandragupt
5e6aab8972 fix(snap): drop snap-injected command argument in runtime wrapper 2026-01-11 01:10:29 +05:30
Amrit kumar Mahto
a77c21c06c fix: allocator mismatch in demuxer (use malloc/free instead of Box) 2026-01-11 00:49:17 +05:30
Carlos Fernandez Sanz
4252703431 fix(matroska): Prevent infinite loop on truncated MKV files 2026-01-10 13:16:12 +01:00
Carlos Fernandez Sanz
1af2a29a3c fix: Prevent NULL pointer dereference in DVB subtitle decoder 2026-01-10 11:18:56 +01:00
Carlos Fernandez Sanz
8ab474c593 fix: Remove debug println that printed spurious numbers during processing 2026-01-10 11:18:20 +01:00
Carlos Fernandez
1c781c2a38 fix: Correct progress time display for multi-program TS files
Multi-program transport stream files can have different PCR (Program
Clock Reference) bases for each program. For example, one program might
have timestamps starting at 23 hours, another at 25 hours. This caused
the progress time display to show wildly incorrect values like "265:45"
for a 6-second file.

The fix tracks the minimum timestamp offset seen across all programs and
uses that as the baseline. When timestamps from programs with higher PCR
bases are encountered (offset > 60 seconds from minimum), the display
falls back to showing time relative to the minimum baseline.

Changes:
- Add min_global_timestamp_offset field to lib_ccx_ctx to track the
  minimum PCR-based offset seen
- Update progress display logic in general_loop.c to normalize times
  relative to the minimum offset
- Apply same fix to both live stream and file processing modes

Test results with multi-program DVB teletext sample (dvbt.ts):
- Before: 1% | 265:45, 2% | 00:00, 3% | 263:11, ... (jumping wildly)
- After:  1% | 00:00, 2% | 00:00, ... 87% | 00:05, 100% | 00:00 (stable)

Single-program files continue to work correctly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 10:57:57 +01:00
Carlos Fernandez
4d718378d5 fix: Remove debug println that printed spurious numbers during processing
Removes a debug println statement in the Rust timestamp conversion code
that was printing the hours value when it exceeded 24. This caused
spurious numbers (like "25") to appear in the output when processing
files with PTS timestamps that exceeded 24 hours.

The debug code was likely left over from development/debugging and
should not be present in production code.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 10:50:33 +01:00
Carlos Fernandez
1bd4cd5c0a fix: Prevent NULL pointer dereference in DVB subtitle decoder
Add NULL check for `region` before accessing `region->bgcolor` in
the OCR processing block of `write_dvb_sub()`.

The bug occurs when processing DVB subtitles where `get_region()`
returns NULL for all display items in the list. After the display
processing loop, `region` may be NULL, but the code attempted to
access `region->bgcolor` unconditionally, causing a segfault.

The crash manifested as:
- Valgrind: "Invalid read of size 4 at address 0x18"
- The 0x18 offset corresponds to the `bgcolor` field in DVBSubRegion

Testing with bbc_small.ts:
- Before: SIGSEGV crash at 0% processing
- After: 100% processing, 50+ subtitles extracted successfully

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 10:20:50 +01:00
Carlos Fernandez
067045ce92 fix(matroska): Prevent infinite loop on truncated MKV files
When parsing truncated MKV files, the Matroska parser would enter an
infinite loop. This happened because:

1. At EOF, fgetc() returns -1 which becomes 0xFF when cast to UBYTE
2. Reading 4 EOF bytes creates element code 0xFFFFFFFF (unknown element)
3. The "skip unknown element" logic reads another 0xFF as vint length (127)
4. FSEEK past EOF clears the EOF flag without error
5. The while loop condition (pos + len > get_current_byte) never becomes
   false because the recorded segment length is larger than the file

The fix adds feof() checks after each mkv_read_byte() call in all
parsing loops. This detects EOF immediately after reading and breaks
out of the loop cleanly.

Tested with truncated MKV samples (ticket1398-orig.mkv, azumi.mkv)
that previously caused timeouts - now complete in under a second.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-10 09:50:21 +01:00
Carlos Fernandez Sanz
2f2904041c prevent unsafe Vec::set_len causing heap corruption 2026-01-09 23:45:34 +01:00
Carlos Fernandez Sanz
d837c369e5 fix: prevent FFI memory leaks in demuxer sync 2026-01-09 23:44:52 +01:00
Carlos Fernandez Sanz
686ff69fdc Docs: clarify Linux autotools build and Rust dependency 2026-01-09 23:43:10 +01:00
Carlos Fernandez Sanz
126835d998 Merge pull request #1850 from gaurav02081/gaurav-v1
[FIX] -out=spupng with EIA608/teletext: offset values in XML may not be correct #893
2026-01-09 23:25:58 +01:00
Akhilesh
6e170cd812 Docs: clarify Linux autotools build and Rust dependency 2026-01-09 21:02:18 +05:30
Rahul Tripathi
fe921626e1 Fix: Off-by-one bounds check and encoding corruption
- telxcc.c: Use array_length macro for G0_LATIN_NATIONAL_SUBSETS
  bounds check instead of hardcoded value. Prevents potential
  access to uninitialized memory when index equals array size.
- misc.h: Fix UTF-8 encoding of author name (Iñaki García Etxebarria)
2026-01-09 16:02:10 +05:30
Amrit kumar Mahto
6578f0ff34 fix(avc): prevent unsafe Vec::set_len causing heap corruption 2026-01-09 05:15:57 +05:30
Amrit kumar Mahto
1911068e92 fix(rust): prevent FFI memory leaks in demuxer sync 2026-01-08 14:46:56 +05:30
Chandragupt
493495361d ci(snap): use stable GitHub Actions v6 and make runtime library resolution robust 2026-01-08 09:24:25 +05:30
Chandragupt
643857e98f docs: add changelog entry for Snap packaging 2026-01-08 06:09:33 +05:30
Chandragupt
05adb5f47e snap: add website and source-code metadata 2026-01-08 06:08:29 +05:30
Chandragupt
504877b928 ci(snap): remove temporary push trigger 2026-01-08 06:08:29 +05:30
Chandragupt
64ee63a560 ci(snap): enable push trigger for snap workflow (temporary) 2026-01-08 06:08:00 +05:30
Chandragupt
270c603bd2 ci(snap): add GitHub Actions workflow for Snapcraft-based builds 2026-01-08 06:06:13 +05:30
dependabot[bot]
6d356b4458 chore(deps): bump dawidd6/action-homebrew-bump-formula from 4 to 7 (#1989)
Bumps [dawidd6/action-homebrew-bump-formula](https://github.com/dawidd6/action-homebrew-bump-formula) from 4 to 7.
- [Release notes](https://github.com/dawidd6/action-homebrew-bump-formula/releases)
- [Commits](https://github.com/dawidd6/action-homebrew-bump-formula/compare/v4...v7)

---
updated-dependencies:
- dependency-name: dawidd6/action-homebrew-bump-formula
  dependency-version: '7'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-08 01:24:47 +01:00
Carlos Fernandez Sanz
cfb10d4b91 fix: Delete empty output files instead of leaving 0-byte files (#1282) (#1877)
When using --output-field both (formerly -12), CCExtractor creates
separate output files for each field. If one field has no captions,
a 0-byte file was left behind, which is confusing for users.

This fix checks the file size in dinit_write() before closing.
If the file is empty (0 bytes), it deletes the file and prints
an informational message.

This is a simpler approach than deferred file creation - files are
still created at initialization but cleaned up if they remain empty.

Fixes #1282

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-08 01:23:28 +01:00
Amrit kumar Mahto
ca2b708023 fix: prevent MP4 & PS demuxer panics due to out-of-bounds/underflow (#1995) 2026-01-08 02:36:30 +05:30
Amrit kumar Mahto
10ac5ca6ce add safety checks and comments in Teletext decoder 2026-01-08 01:42:09 +05:30
Amrit kumar Mahto
333cfb3726 fix: Teletext decoder panic on malformed BCD data (#1990) 2026-01-08 01:26:17 +05:30
GAURAV KARMAKAR
c609f66c02 Removed Build Artifact 2026-01-08 01:03:54 +05:30
Gaurav karmakar
91f254017b Merge branch 'master' into gaurav-v1 2026-01-08 00:47:22 +05:30
GAURAV KARMAKAR
1f5d3df0ae Merge branch 'master' of https://github.com/gaurav02081/ccextractor into gaurav-v1 2026-01-08 00:35:33 +05:30
Rahul Tripathi
e36d81c237 Git Cleanup: Update .gitignore and untrack build artifacts 2026-01-07 21:38:36 +05:30
Rahul Tripathi
8d338dc362 Fix DVB subtitle repeating bug: initialize nb_data 2026-01-07 21:37:23 +05:30
Rahul Tripathi
c78e01d186 Merge branch 'CCExtractor:master' into final 2026-01-06 12:31:17 +05:30
Chandragupt Singh
401ff6c105 docs: note Homebrew availability in changelog 2026-01-06 06:04:57 +05:30
Chandragupt Singh
83eb51ed6f docs: add Homebrew installation instructions 2026-01-06 06:01:56 +05:30
Carlos Fernandez
bce0c92fdd ci: Add Homebrew formula auto-bump workflow
Automatically creates a PR to homebrew-core when a new release
is published, updating the ccextractor formula to the new version.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-06 00:08:40 +01:00
Rahul Tripathi
ea4859fd54 Fix: Add split_dvb_subs to Options default 2026-01-05 21:39:54 +05:30
Rahul Tripathi
8d7890c743 Merge branch 'master' into final 2026-01-05 21:10:51 +05:30
Carlos Fernandez Sanz
477307e438 chore: Bump version to 0.96.5 2026-01-05 16:02:39 +01:00
Carlos Fernandez
4a4911bcec chore: Bump version to 0.96.5
Update version number across all packaging and build files for the
0.96.5 release.

Files updated:
- docs/CHANGES.TXT - Added changelog entry
- src/lib_ccx/lib_ccx.h - VERSION define
- linux/configure.ac - AC_INIT version
- mac/configure.ac - AC_INIT version
- OpenBSD/Makefile - V variable
- package_creators/PKGBUILD - pkgver
- package_creators/ccextractor.spec - Version
- package_creators/debian.sh - VERSION
- packaging/chocolatey/ccextractor.nuspec - version
- packaging/chocolatey/tools/chocolateyInstall.ps1 - URL
- packaging/winget/*.yaml - PackageVersion and URLs

Note: SHA256 checksums in chocolatey and winget files will need to be
updated after the MSI is built.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 12:44:06 +01:00
Carlos Fernandez Sanz
dc946168e7 Fix OOB read/write and length handling in CEA-608/708 decoders 2026-01-05 12:36:31 +01:00
Carlos Fernandez Sanz
3a60b1268b Merge pull request #1981 from CCExtractor/fix/epg-snprintf-buffer-warning
fix(epg): Silence snprintf buffer truncation warnings
2026-01-05 12:33:15 +01:00
Carlos Fernandez
e3d1c56ad0 fix(epg): Silence snprintf buffer truncation warnings
Extend EPG time string buffers from 21 to 74 bytes to silence
compiler warnings about potential buffer truncation.

The actual output is always 20 chars ("YYYYMMDDHHMMSS +0000") plus
null terminator, but the compiler warns because %02d with int
arguments could theoretically produce larger output.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-05 11:29:48 +01:00
Carlos Fernandez Sanz
b5bc0e2616 Fix OOB read/write in Teletext G0 charset remapping 2026-01-05 11:28:11 +01:00
Carlos Fernandez Sanz
600a9a0e75 Add support for raw CDP (Caption Distribution Packet) files 2026-01-05 10:55:59 +01:00
Amrit kumar Mahto
694b61f862 Fix OOB read/write in Teletext G0 charset remapping 2026-01-04 23:47:08 +05:30
Carlos Fernandez
86925727e0 Merge remote-tracking branch 'origin/master' into feat/issue-1406-raw-cdp-support 2026-01-04 17:20:04 +01:00
Carlos Fernandez Sanz
1c7515681e Fix MXF files containing CEA-708 captions not being detected/extracted 2026-01-04 17:17:33 +01:00
Carlos Fernandez Sanz
2bcac83761 Docs: Add Windows WSL build instructions 2026-01-04 14:33:34 +01:00
Carlos Fernandez
efc28d87d5 Trigger CI 2026-01-04 14:08:41 +01:00
Carlos Fernandez
b4d8e0ffaf Trigger CI 2026-01-04 14:08:26 +01:00
Carlos Fernandez
0b7b7fd031 Trigger CI 2026-01-04 12:56:44 +01:00
Carlos Fernandez
90041554a3 Fix Rust formatting and clippy issues
- Apply cargo fmt to decoder/mod.rs
- Fix clippy manual_flatten warning in build.rs by using .flatten()

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 12:55:35 +01:00
Carlos Fernandez
6950a7661e Fix MXF files containing CEA-708 captions not being detected/extracted
Root cause: CCX_RAW_TYPE data from MXF demuxer was not being passed to
the DTVCC decoder, only to the legacy 608 decoder via process_raw_with_field.

Changes:
- general_loop.c: Changed CCX_RAW_TYPE handling to use process_cc_data
  instead of process_raw_with_field to properly invoke DTVCC decoder
- general_loop.c: Added DTVCC activation for MXF/GXF sources since they
  may contain 708 captions
- general_loop.c: Initialize timing from caption PTS when not set
- ccx_dtvcc.h: Added ccxr_dtvcc_set_active FFI declaration
- lib.rs: Added ccxr_dtvcc_set_active function to enable DTVCC decoder
- decoder/mod.rs: Fixed flush logic to always process visible windows
- ccx_demuxer_mxf.c: Fixed PTS calculation to use 90kHz units based on
  edit_rate, and changed verbose logging to debug()

Fixes #1647

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 11:17:54 +01:00
Carlos Fernandez
41fb966f6f Add support for raw CDP (Caption Distribution Packet) files
Adds support for processing raw CDP files captured from SDI VANC
(e.g., from Blackmagic Decklink capture cards). CDP packets are
automatically detected by their 0x9669 identifier when using -in=raw.

Changes:
- Added process_raw_cdp() function to parse concatenated CDP packets
- Added CDP format detection in raw_loop() (checks for 0x9669 header)
- Extracts cc_data triplets from CDP packets and processes them
  through process_cc_data() for both CEA-608 and CEA-708 support
- Calculates timing based on CDP frame rate and packet count

Usage:
  ccextractor -in=raw captured_vanc.bin -o output.srt

Fixes #1406

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 09:54:37 +01:00
Carlos Fernandez
04ed95f8b5 Fix MXF files containing CEA-708 captions not being detected/extracted
Root cause: CCX_RAW_TYPE data from MXF demuxer was not being passed to
the DTVCC decoder, only to the legacy 608 decoder via process_raw_with_field.

Changes:
- general_loop.c: Changed CCX_RAW_TYPE handling to use process_cc_data
  instead of process_raw_with_field to properly invoke DTVCC decoder
- general_loop.c: Added DTVCC activation for MXF/GXF sources since they
  may contain 708 captions
- general_loop.c: Initialize timing from caption PTS when not set
- ccx_dtvcc.h: Added ccxr_dtvcc_set_active FFI declaration
- lib.rs: Added ccxr_dtvcc_set_active function to enable DTVCC decoder
- decoder/mod.rs: Fixed flush logic to always process visible windows
- ccx_demuxer_mxf.c: Fixed PTS calculation to use 90kHz units based on
  edit_rate, and changed verbose logging to debug()

Fixes #1647

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 09:54:20 +01:00
Carlos Fernandez
ddf29672fd Fix MXF files containing CEA-708 captions not being detected/extracted
Root cause: CCX_RAW_TYPE data from MXF demuxer was not being passed to
the DTVCC decoder, only to the legacy 608 decoder via process_raw_with_field.

Changes:
- general_loop.c: Changed CCX_RAW_TYPE handling to use process_cc_data
  instead of process_raw_with_field to properly invoke DTVCC decoder
- general_loop.c: Added DTVCC activation for MXF/GXF sources since they
  may contain 708 captions
- general_loop.c: Initialize timing from caption PTS when not set
- ccx_dtvcc.h: Added ccxr_dtvcc_set_active FFI declaration
- lib.rs: Added ccxr_dtvcc_set_active function to enable DTVCC decoder
- decoder/mod.rs: Fixed flush logic to always process visible windows
- ccx_demuxer_mxf.c: Fixed PTS calculation to use 90kHz units based on
  edit_rate, and changed verbose logging to debug()

Fixes #1647

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-04 09:53:30 +01:00
Kurma Ritish
0890e06d84 docs: add Windows WSL build instructions 2026-01-04 08:47:48 +00:00
Carlos Fernandez Sanz
8c33412888 Merge pull request #1971 from ujjwalr27/scc-accurate-timing
Tested with the Broadcast Source sample from issue #1120. The pre-roll timing calculation works correctly, and the output structure matches broadcast reference patterns.
2026-01-03 21:50:43 +01:00
ujjwalr27
f40294cc5c minor fix 2026-01-03 23:38:16 +05:30
ujjwalr27
22d5d35158 Fix SCC accurate timing: separate load/display timestamps, skip clear commands, pass YouTube validation 2026-01-03 22:38:16 +05:30
Amrit kumar Mahto
51cae1c2f0 Fix OOB read/write and length handling in CEA-608/708 decoders 2026-01-03 17:42:38 +05:30
Carlos Fernandez Sanz
dfaebd5db8 Merge pull request #1968 from THE-Amrit-mahto-05/fix/dtvcc-critical-bugs
fix DTVCC: Heap Buffer Overflow & Out-of-Bounds Read
2026-01-03 11:54:19 +01:00
Carlos Fernandez Sanz
cfa7d912ca fix(rust): Flush stdout after print to fix stream mode display 2026-01-03 11:38:25 +01:00
Carlos Fernandez
ad971f0e72 fix(rust): Flush stdout after print to fix stream mode display
When using --input <format>, the startup output showed [Stream mode: ]
(empty) instead of showing the format name like [Stream mode: SCC].

Root cause: The Rust logger's print() function uses print!() which
doesn't automatically flush stdout. When mixing C and Rust code that
both write to stdout, the Rust output was getting buffered and not
appearing before the C code continued writing.

The fix adds explicit std::io::stdout().flush() after each print!()
call to ensure output appears immediately and interleaves correctly
with C code.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 23:22:46 +01:00
Carlos Fernandez Sanz
8aadbfb5f2 feat: Add --input scc option for SCC input format 2026-01-02 23:09:56 +01:00
Amrit kumar Mahto
44eb665cd8 chore: apply clang-format fixes 2026-01-03 03:12:19 +05:30
Amrit kumar Mahto
1255b318ae [FIX] Remove dead safety checks per reviewer feedback 2026-01-03 03:06:23 +05:30
Carlos Fernandez
1b0e66bc67 feat: Add --input scc option for SCC input format
Add support for `--input scc` command line option to explicitly specify
SCC (Scenarist Closed Caption) input format, for consistency with other
input format options.

Changes:
- Add `Scc` variant to `InFormat` enum in args.rs
- Handle `InFormat::Scc` in parser.rs to set StreamMode::Scc
- Add `StreamMode::Scc` case in print_cfg() in both Rust and C code

Fixes #1972

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 21:45:08 +01:00
Carlos Fernandez Sanz
f5dc1cf467 fix: Make --quiet flag work again 2026-01-02 21:35:42 +01:00
ujjwalr27
aaf937a135 Fix rustfmt style issues in lib_ccxr 2026-01-03 01:05:59 +05:30
ujjwalr27
317c66f14e Fix clang-format style issues 2026-01-03 01:02:19 +05:30
ujjwalr27
946c5859d4 Add --scc-accurate-timing option for bandwidth-aware SCC output (fixes #1120) 2026-01-03 00:28:16 +05:30
ujjwalr27
7166e48698 Add --scc-accurate-timing option for bandwidth-aware SCC output (fixes #1120) 2026-01-03 00:27:17 +05:30
Carlos Fernandez
d31ea87c03 fix: Make --quiet flag work again
The --quiet flag was broken due to two issues:

1. Inverted mapping in Rust FFI: The C→Rust constant mapping was wrong.
   CCX_MESSAGES_QUIET=0, CCX_MESSAGES_STDOUT=1, CCX_MESSAGES_STDERR=2
   but the Rust code mapped 0→Stdout, 1→Stderr, 2→Quiet.

2. Logger initialization timing: The Rust logger was initialized BEFORE
   command-line arguments were parsed, so --quiet had no effect.

Changes:
- Fix the OutputTarget mapping in ccxr_init_basic_logger()
- Add set_target() method to CCExtractorLogger
- Add ccxr_update_logger_target() to update logger after arg parsing
- Call ccxr_update_logger_target() after ccxr_parse_parameters()

Fixes #1956

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 19:49:06 +01:00
Amrit kumar Mahto
028ce9d0b5 [FIX] DTVCC: Heap Overflow & OOB Read 2026-01-02 18:33:26 +05:30
Amrit kumar Mahto
cc7a43b5e2 [FIX] Teletext decoder: fix OOB read/write and loop overflow (#1965) 2026-01-02 18:09:15 +05:30
Amrit kumar Mahto
3e1424cda8 Fix TS/ES: Integer overflow, stack overflow, heap over-read 2026-01-02 17:52:25 +05:30
Amrit kumar Mahto
82109e6cd9 Fix DTVCC structural type confusion and OOB writes (#1961) 2026-01-02 17:27:15 +05:30
Amrit kumar Mahto
5dc8292dd2 Fix out-of-bounds read in H.264 SEI parsing 2026-01-02 16:58:09 +05:30
Carlos Fernandez Sanz
a5b8bc8bf6 fix(rust): Update palette crate to 0.7 for Fedora compatibility 2026-01-02 10:00:00 +01:00
Rahul Tripathi
29158b2c38 Merge branch 'master' into final 2026-01-02 14:18:45 +05:30
Carlos Fernandez
ad2ee70743 fix(rust): Update palette crate to 0.7 for Fedora compatibility
The palette crate renamed `to_positive_degrees()` to `into_positive_degrees()`
in version 0.7.0. This was causing build failures on Fedora which uses
system-packaged Rust crates with newer versions.

Changes:
- Update palette dependency from 0.6.1 to 0.7
- Change method call from to_positive_degrees() to into_positive_degrees()

Fixes build failure reported in #1954.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-02 08:11:47 +01:00
Carlos Fernandez Sanz
562de8893b Merge pull request #1953 from THE-Amrit-mahto-05/fix/ts-heap-overflow
Fix TS heap overflow
2026-01-02 08:09:39 +01:00
Carlos Fernandez Sanz
12adb5e92b fix(ci): Fix Windows CI cargo build cache path 2026-01-02 08:06:22 +01:00
Carlos Fernandez Sanz
203eb23030 fix(build): Support FFMPEG_INCLUDE_DIR on Linux for hardsubx 2026-01-02 08:02:46 +01:00
Amrit Kumar Mahto
774c3a0d3a Update CHANGES.TXT 2026-01-02 04:31:39 +05:30
Amrit Kumar Mahto
07f1ddc3fe Fix capbufsize and capbuflen assignments to use size_t 2026-01-02 04:26:23 +05:30
Carlos Fernandez
303bec8d5d fix(build): Support FFMPEG_INCLUDE_DIR on Linux for hardsubx
The FFMPEG_INCLUDE_DIR environment variable was only checked inside
the macOS-specific block, so it had no effect on Linux builds.

Changes:
- Move FFMPEG_INCLUDE_DIR check outside platform-specific blocks so
  it works on all platforms
- Add pkg-config fallback on Linux to automatically find FFmpeg
  include paths

This fixes compilation on systems like Fedora where FFmpeg headers
are installed in non-standard locations (e.g., /usr/include/ffmpeg).

Fixes #1954

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 23:24:44 +01:00
Amrit kumar Mahto
e43a6b5ced Fix TS Heap Buffer Overflow in copy_payload_to_capbuf (ts_functions.c) 2026-01-02 00:59:31 +05:30
Amrit kumar Mahto
64484af49e [FIX] Prevent stack buffer overflow in ISDB-CC decoder parse_csi 2026-01-02 00:40:07 +05:30
Amrit kumar Mahto
7526da884c Prevent integer overflow in EIA-608 screen buffer reallocation 2026-01-01 23:20:25 +05:30
Carlos Fernandez Sanz
3529bb29b4 fix(avc): Remove unnecessary TODO for idr_pic_id 2026-01-01 13:02:25 +01:00
Carlos Fernandez
925560f773 fix(avc): Remove unnecessary TODO for idr_pic_id
The idr_pic_id is read to advance the bitstream position (required for
correct parsing of subsequent fields), but the value itself is not
needed for caption extraction. CCExtractor uses pic_order_cnt_lsb for
frame ordering and PTS for timing - idr_pic_id serves no purpose here.

Closes #1895

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 12:58:55 +01:00
Carlos Fernandez
200eb1750a fix(ci): Fix Windows CI cargo build cache path
- Fix cargo build cache path: rust.bat sets CARGO_TARGET_DIR to the
  windows/ directory, which results in artifacts at
  windows/x86_64-pc-windows-msvc/, not windows/target/
- Remove redundant CARGO_TARGET_DIR from build steps since rust.bat
  overrides it anyway

Note: vcpkg.json builtin-baseline intentionally not changed to avoid
breaking transitive dependencies (libxml2 etc.)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 12:44:18 +01:00
Carlos Fernandez Sanz
6dcdb4b2d8 chore: Bump version to 0.96.4 2026-01-01 10:52:36 +01:00
Carlos Fernandez Sanz
a2d2c4f063 Merge branch 'master' into release/0.96.4 2026-01-01 10:39:12 +01:00
Carlos Fernandez
4ab6c83c27 chore: Bump version to 0.96.4
Update version numbers across all packaging and build files for the
0.96.4 release.

Changes in 0.96.4:
- New: Persistent CEA-708 decoder context
- New: OCR character blacklist options
- New: OCR line-split option
- Fix: 32-bit build failures (i686, armv7l)
- Fix: Legacy argument compatibility (-1, -2, -12, --sc, --svc)
- Fix: Prevent heap buffer overflow in Teletext (security)
- Fix: Lazy OCR initialization

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 10:17:56 +01:00
Carlos Fernandez Sanz
e66a0183c3 Merge pull request #1941 from Harshdhall01/cleanup-rust-todos
[RUST] Document EIA-708 buffer size and remove debug logging
2026-01-01 09:59:22 +01:00
Carlos Fernandez Sanz
a8ec28630a Merge pull request #1934 from THE-Amrit-mahto-05/fix/teletext-overflow
prevent heap buffer overflow in Teletext demux path
2026-01-01 09:53:01 +01:00
Carlos Fernandez Sanz
432d4237ec ci(windows): Optimize Windows build workflow for faster CI 2026-01-01 09:42:19 +01:00
Carlos Fernandez
e9519c4a67 fix(ci): Remove broken Chocolatey caching for GPAC
The Chocolatey cache only stored package metadata, not the actual
installed SDK files at C:\Program Files\GPAC\sdk\include. This caused
build failures when the cache hit but GPAC headers weren't available.

GPAC install is fast (~30s) so caching isn't worth the complexity.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 09:31:11 +01:00
Carlos Fernandez Sanz
fef005ddaf perf(dvb): Lazy OCR initialization for DVB subtitle decoder 2026-01-01 02:48:22 +01:00
Carlos Fernandez
546c776e57 ci(windows): Optimize Windows build workflow for faster CI
Major optimizations to reduce Windows build time from ~45 min to ~10 min:

1. **Single consolidated job** - Previously two parallel jobs (Release/Debug)
   duplicated the entire 34-minute vcpkg install. Now builds both
   configurations sequentially in one job, sharing all cached dependencies.

2. **lukka/run-vcpkg action** - Replaces manual git clone + bootstrap with
   the official vcpkg action that has built-in caching and better handling.

3. **Cache vcpkg installed packages** - Separately cache the installed/
   directory with hash-based keys for faster cache hits.

4. **Cargo caching** - Add caching for Rust registry and build artifacts,
   similar to the Linux build workflow.

5. **Chocolatey caching** - Cache gpac package to skip download on hits.

6. **Conditional installs** - Skip vcpkg install and choco install when
   cache is available.

7. **Updated Rust toolchain action** - Replace deprecated actions-rs/toolchain
   with dtolnay/rust-toolchain.

Expected improvements:
- Cold build: ~20 minutes (down from ~45 min)
- Warm build (cache hit): ~5-10 minutes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 02:03:35 +01:00
Carlos Fernandez Sanz
daeed5df71 fix(args): Add legacy aliases for backwards compatibility 2026-01-01 01:49:59 +01:00
Carlos Fernandez
b56ab005a8 perf(dvb): Lazy OCR initialization for DVB subtitle decoder
Previously, Tesseract OCR was initialized eagerly when a DVB subtitle
stream was detected in the transport stream. This caused ~10 second
startup overhead even for files that:
- Have DVB streams but no actual bitmap subtitles
- Have DVB streams alongside CEA-608 text captions (which don't need OCR)
- Have DVB streams but the user only wants raw bitmap output

The initialization also created OpenMP worker threads that generated
hundreds of thousands of futex syscalls, causing valgrind tests to
take 15+ minutes instead of seconds.

This change defers OCR initialization until a DVB bitmap region actually
needs to be processed with OCR. Benefits:

- Files with DVB streams but no bitmap content: 10s → 0.1s
- Files with DVB + CEA-608 captions: 10s → 1-3s
- Valgrind test performance: 15+ min → seconds (no thread pool overhead
  when OCR isn't used)

The ocr_initialized flag ensures init_ocr() is called only once, on
first bitmap encounter.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 01:26:27 +01:00
Carlos Fernandez
f1681ee929 fix(args): Add support for legacy -1, -2, -12 numeric options
Map legacy CEA-608 field extraction options to their modern equivalent:
- -1  → --output-field=1 (extract field 1 only)
- -2  → --output-field=2 (extract field 2 only)
- -12 → --output-field=12 (extract both fields)

These options are documented in the help text and were commonly used
but stopped working after the Rust argument parser migration.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 01:02:54 +01:00
Carlos Fernandez
031f463b5c fix(args): Add legacy aliases for backwards compatibility
Add aliases for options that were commonly used with single-dash
or without hyphens in older versions of ccextractor:

- --parsePAT: add alias "pat" (for -pat)
- --parsePMT: add alias "pmt" (for -pmt)
- --no-teletext: add alias "noteletext" (for -noteletext)
- --no-rollup: add alias "noru" (for -noru)
- --no-bom: add alias "nobom" (for -nobom)
- --no-autotimeref: add alias "noautotimeref" (for -noautotimeref)
- --no-scte20: add alias "noscte20" (for -noscte20)

These aliases, combined with normalize_legacy_option() which converts
single-dash to double-dash (e.g., -noteletext -> --noteletext), allow
old scripts using legacy syntax to continue working.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-01 00:42:23 +01:00
Carlos Fernandez Sanz
b23866f5a8 feat(rust): Add persistent DtvccRust context for CEA-708 decoder 2026-01-01 00:21:40 +01:00
Carlos Fernandez
2ec93c3d3d fix(rust): Check dtvcc_rust instead of dtvcc in ccxr_process_cc_data
When Rust CEA-708 decoder is enabled, dec_ctx.dtvcc is set to NULL
and dec_ctx.dtvcc_rust holds the actual DtvccRust context. The null
check was incorrectly checking dtvcc, causing the function to return
early and skip all CEA-708 data processing.

This fixes tests 21, 31, 32, 105, 137, 141-149 which were failing
with exit code 10 (EXIT_NO_CAPTIONS) because no captions were being
extracted from CEA-708 streams.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 19:47:24 +01:00
Harshdhall01
5564aa8a54 Merge upstream/master and resolve CHANGES.TXT conflict 2025-12-31 23:51:24 +05:30
Harshdhall01
868fac5423 Update CHANGES.TXT with Rust documentation improvements 2025-12-31 23:33:49 +05:30
Harshdhall01
9ca26171d6 Document EIA-708 buffer size and remove debug logging
- Added documentation for EIA_708_BUFFER_LENGTH explaining that 2048 bytes
  is 16x the CEA-708 specification minimum of 128 bytes per service
- Removed debug logging of target address from target.rs as per TODO
- References CEA-708-E Section 8.4.3 for buffer specifications

Addresses two TODO items in the Rust codebase cleanup effort.
2025-12-31 23:24:39 +05:30
Carlos
ead4cbb278 fix(rust): remove double-increment of cb_708 counter
The cb_708 counter was being incremented twice for each CEA-708 data block:
1. In do_cb_dtvcc_rust() in Rust (src/rust/src/lib.rs)
2. In do_cb() in C (src/lib_ccx/ccx_decoders_common.c)

Since FTS calculation uses cb_708 (fts = fts_now + fts_global + cb_708 * 1001 / 30),
the double-increment caused timestamps to advance ~2x as fast as expected,
resulting in incorrect milliseconds in start timestamps.

This fix removes the increment from the Rust code since the C code already
handles it in do_cb().

Fixes timestamp issues reported in PR #1782 tests where start times like
00:00:20,688 were incorrectly output as 00:00:20,737.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:18:13 +01:00
Carlos
dfd7101f54 chore: Remove plan file from repo and add plans/ to .gitignore
- Move PLAN_PR1618_REIMPLEMENTATION.md to local plans/ folder
- Add plans/ to .gitignore to keep plans local

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:18:13 +01:00
Carlos
9659d3cf4c fix(rust): Use persistent DtvccRust context in ccxr_process_cc_data
The ccxr_process_cc_data function was still accessing dec_ctx.dtvcc
(which is NULL when Rust is enabled), causing a null pointer panic.

Changed to use dec_ctx.dtvcc_rust (the persistent DtvccRust context)
instead, which fixes the crash when processing CEA-708 data.

Added do_cb_dtvcc_rust() function that works with DtvccRust instead
of the old Dtvcc struct.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:18:13 +01:00
Carlos
34c7cd6d2e style(c): Fix clang-format issues in Phase 3 code
- Remove extra space before comment in ccx_decoders_common.c
- Fix comment indentation in mp4.c

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:16:31 +01:00
Carlos
7448a260c7 feat(c): Use Rust CEA-708 decoder in C code (Phase 3)
- init_cc_decode(): Initialize dtvcc_rust via ccxr_dtvcc_init()
- dinit_cc_decode(): Free dtvcc_rust via ccxr_dtvcc_free()
- flush_cc_decode(): Flush via ccxr_flush_active_decoders()
- general_loop.c: Set encoder via ccxr_dtvcc_set_encoder() (3 locations)
- mp4.c: Use ccxr_dtvcc_set_encoder() and ccxr_dtvcc_process_data()
- Add ccxr_dtvcc_is_active() declaration to ccx_dtvcc.h
- Fix clippy warnings in tv_screen.rs (unused assignments)
- All changes guarded with #ifndef DISABLE_RUST
- Update implementation plan to mark Phase 3 complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:16:31 +01:00
Carlos
54236f840c feat(c): Add C header declarations for Rust CEA-708 FFI (Phase 2)
- Add void *dtvcc_rust field to lib_cc_decode struct
- Declare ccxr_dtvcc_init, ccxr_dtvcc_free, ccxr_dtvcc_process_data in ccx_dtvcc.h
- Declare ccxr_dtvcc_set_encoder in lib_ccx.h
- Declare ccxr_flush_active_decoders in ccx_decoders_common.h
- All declarations guarded with #ifndef DISABLE_RUST
- Update implementation plan to mark Phase 2 complete

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:16:31 +01:00
Carlos Fernandez Sanz
f2aeef167b feat(ocr): Add character blacklist and line-split options for better accuracy 2025-12-31 14:16:15 +01:00
Carlos
6a4a1c97ec fix(rust): Address PR review - use existing DTVCC_MAX_SERVICES constant
- Remove duplicate CCX_DTVCC_MAX_SERVICES constant from decoder/mod.rs
- Import existing DTVCC_MAX_SERVICES from lib_ccxr::common
- Fix clippy uninlined_format_args warnings in avc/core.rs and decoder/mod.rs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:15:29 +01:00
Carlos
f369959096 style(rust): Apply cargo fmt formatting
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:15:29 +01:00
Carlos
1c2bcb5088 feat(rust): Add persistent DtvccRust context for CEA-708 decoder (Phase 1)
This is Phase 1 of the fix for issue #1499. It adds the Rust-side
infrastructure for a persistent CEA-708 decoder context without
modifying any C code, ensuring backward compatibility.

Problem:
The current Rust CEA-708 decoder creates a new Dtvcc struct on every
call to ccxr_process_cc_data(), causing all state to be reset. This
breaks stateful caption processing.

Solution:
Add a new DtvccRust struct that:
- Owns its decoder state (rather than borrowing from C)
- Persists across processing calls
- Is managed via FFI functions callable from C

Changes:
- Add DtvccRust struct in decoder/mod.rs with owned decoders
- Add CCX_DTVCC_MAX_SERVICES constant (63)
- Add FFI functions in lib.rs:
  - ccxr_dtvcc_init(): Create persistent context
  - ccxr_dtvcc_free(): Free context and all owned memory
  - ccxr_dtvcc_set_encoder(): Set encoder (not available at init)
  - ccxr_dtvcc_process_data(): Process CC data
  - ccxr_flush_active_decoders(): Flush all active decoders
  - ccxr_dtvcc_is_active(): Check if context is active
- Add unit tests for DtvccRust
- Use heap allocation for large structs to avoid stack overflow

The existing Dtvcc struct and ccxr_process_cc_data() remain unchanged
for backward compatibility. Phase 2-3 will add C header declarations
and modify C code to use the new functions.

Fixes: #1499 (partial)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 14:15:29 +01:00
Carlos Fernandez Sanz
da79ee44d9 fix(rust): Fix 32-bit build failures (i686, armv7l) 2025-12-31 13:16:17 +01:00
Carlos Fernandez Sanz
26434a7f89 fix(args): Add --sc alias for --sentencecap for backwards compatibility 2025-12-31 13:02:50 +01:00
Carlos Fernandez
718eb1a37f fix(args): Add --sc alias for --sentencecap for backwards compatibility
The -sc flag was used in older versions (0.94 and earlier) for sentence
capitalization. The Rust argument parser only accepts --sentencecap now.
This adds --sc as an alias to maintain backwards compatibility with
older documentation and user scripts.

Related to #1917

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:57:42 +01:00
Carlos Fernandez
ace6361bfb fix(rust): Fix armv7l build failure with 64-bit literal
The literal `0xcdcdcdcdcdcdcdcd` is a 64-bit value used as a "poison"
pattern to detect uninitialized pointers. On 32-bit systems like
armv7l, this causes a compile error because `usize` is only 32 bits.

The fix defines a platform-appropriate constant:
- 64-bit: 0xcdcdcdcdcdcdcdcd
- 32-bit: 0xcdcdcdcd

Fixes #1938

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:46:39 +01:00
Carlos Fernandez
7041441d39 fix(rust): Fix 32-bit x86 (i686) build failure
The code was using `std::arch::x86_64::*` unconditionally for both
x86 and x86_64 architectures. On 32-bit x86 (i686), the correct
module is `std::arch::x86`, not `std::arch::x86_64`.

This caused a build failure on i686:
  error[E0432]: unresolved import `std::arch::x86_64`

The fix uses separate conditional imports:
- `std::arch::x86::*` for 32-bit x86
- `std::arch::x86_64::*` for 64-bit x86_64

Both modules provide the same SSE2 intrinsics used by find_next_zero().

Fixes #1937

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-31 12:42:12 +01:00
Rahul-2k4
1589c31774 fix: Revert credits text deep-copy to fix CI startcredits regressions 2025-12-31 15:23:55 +05:30
Rahul-2k4
c96d3ff3f1 fix(encoder): Deep copy start/end credits text to prevent memory corruption
The start_credits_text and end_credits_text pointers were being copied
directly from the encoder config options, but free_encoder_context()
would later free them. This caused memory corruption when the pointers
referred to memory owned by ccx_options.

Now these strings are deep-copied in init_encoder() so each encoder
context owns its own copy, fixing the --startcreditstext regression.
2025-12-31 14:18:29 +05:30
Rahul-2k4
598a48e260 style: Apply clang-format to pass CI formatting check 2025-12-31 12:45:56 +05:30
Rahul-2k4
0cc3626261 ci: Trigger workflow run 2025-12-31 12:18:27 +05:30
Rahul-2k4
e0e66bd0ba style: Apply clang-format and update CHANGES.TXT
- Run clang-format on all source files to fix CI formatting check
- Add Issue #447 DVB multi-stream feature to CHANGES.TXT
2025-12-31 12:08:56 +05:30
Rahul-2k4
2642ca8805 Merge upstream/master into final branch
Resolves conflicts while preserving Issue #447 fix for DVB multi-stream handling:
- Kept DVB metadata update logic in ts_tables.c for split mode
- Adapted to upstream's single-param dvbsub_init_decoder signature
- Updated lib_ccx.c and general_loop.c to match new API
2025-12-31 11:42:08 +05:30
Rahul-2k4
a108302dc0 fix(dvb): Reinitialize decoder after PAT change for continuous extraction
After PAT changes, the pipeline's decoder was NULLed out to prevent
crashes, but this caused all subsequent DVB data to be skipped.

Now the decoder is reinitialized when detected as NULL, allowing
subtitle extraction to continue across PAT changes.
2025-12-31 11:19:56 +05:30
Rahul-2k4
ce90b61923 fix(dvb): Add NULL checks to prevent crash after PAT change
Fixes segmentation fault at 99% when PAT changes occur during DVB
subtitle processing. The crash happened because decoder context
private_data was freed but still accessed.

Changes:
- Add NULL check in process_data() before dvbsub_decode call
- Add defensive NULL check at start of dvbsub_decode()
- Add defensive NULL check at start of write_dvb_sub()
- Deep copy DVB bitmap data in copy_subtitle() to avoid aliasing
- Safe DVBSubContext copy that doesn't alias linked list pointers
- Clean up pipeline decoder refs in dinit_cap() after PAT change
- Direct FTS calculation for DVB-only streams

Tested with 11GB TS file with 23 PAT changes - no crash.
2025-12-31 10:44:00 +05:30
Rahul-2k4
18566f2213 fix(dvb): Improve multi-stream DVB subtitle handling for Issue #447
- Replace spin-lock with proper mutex (CRITICAL_SECTION/pthread_mutex)
- Add per-pipeline OCR contexts for thread safety
- Include PID in output filenames to handle duplicate languages
- Add dvbsub_get_context_size() and dvbsub_copy_context() for state management
- Improve language code validation (ISO 639-2 compliant)
- Change fatal error to warning for oversized PES packets
- Better language lookup from potential_streams before cinfo fallback
- Reset potential_stream data in demuxer cleanup
2025-12-30 21:58:40 +05:30
Amrit Kumar Mahto
125c5e8821 Update ts_functions.c 2025-12-30 15:13:19 +05:30
Carlos Fernandez Sanz
64ce4ac84f fix(args): Add --svc alias for --service for backwards compatibility 2025-12-30 09:49:44 +01:00
Carlos Fernandez
674b859284 fix(args): Add --svc alias for --service for backwards compatibility
The help text references -svc for CEA-708 service selection, but the
Rust argument parser only accepted --service. This adds --svc as an
alias to maintain backwards compatibility with older documentation
and user scripts.

Fixes #1917

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 09:30:15 +01:00
Carlos Fernandez Sanz
9a761331f8 Merge pull request #1905 from VS7686/fix-networking-warnings
The fix looks correct - properly adding `return;` after Rust calls to prevent the C code from also executing, and using `(void)` to silence return value warnings.

Windows CI passes (which was the target for this MSVC fix). The Linux CI failure appears unrelated since networking code isn't typically part of the regression test suite.

Merging - thanks for the fix!
2025-12-30 09:02:52 +01:00
Carlos Fernandez Sanz
046ee71eda Merge pull request #1921 from ChubbyChipmunk77/simplify-and-document
Excellent work addressing the feedback! The separation of CC_SOLID_BLANK and PARITY_BIT_MASK makes the code much clearer - even though they have the same value, they serve different purposes and that's now well-documented.

The additional documentation for validate_cc_pair is very helpful for understanding the CEA-608/708 validation logic.

Merging - thanks for the thorough fix!
2025-12-30 08:51:30 +01:00
Carlos Fernandez Sanz
b5fc3e63c4 Merge pull request #1924 from Harshdhall01/cleanup-vcl-hrd-todo
Looks good! The explanation is clearer and removing the dead code (commented exit) is a nice cleanup. Tests pass.

Merging - thanks!
2025-12-30 08:49:18 +01:00
VS7686
5eaf805d27 Add missing returns after Rust calls to prevent fallthrough 2025-12-30 09:20:59 +05:30
Amrit kumar Mahto
0ba941e8c0 ts: prevent heap buffer overflow in Teletext demux path 2025-12-30 07:13:04 +05:30
Carlos Fernandez Sanz
a9413a2312 fix(dvb): Enable OCR for all DVB subtitle streams, not just first 2025-12-29 23:09:18 +01:00
Carlos Fernandez Sanz
a2eb03cb73 docs: Add Windows package manager installation instructions 2025-12-29 23:04:41 +01:00
Carlos Fernandez
06063f26a4 docs: Add Windows package manager installation instructions
Add instructions for installing CCExtractor via:
- WinGet (winget install CCExtractor.CCExtractor)
- Chocolatey (choco install ccextractor)
- Scoop (scoop bucket add extras && scoop install ccextractor)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 22:56:45 +01:00
Carlos Fernandez Sanz
82daa7fb2b fix: Properly handle ATSC CC in private MPEG-2 streams 2025-12-29 22:55:00 +01:00
Carlos Fernandez
a71687e19f fix(dvb): Enable OCR for all DVB subtitle streams, not just first
Previously, the `initialized_ocr` flag was stored at the program level
and shared across all DVB subtitle streams within a program. This caused
OCR to only initialize for the first DVB stream, leaving subsequent
streams without an OCR context and unable to extract subtitles.

The fix removes the `initialized_ocr` flag entirely. Each DVB subtitle
decoder now gets its own OCR context, matching the behavior of DVD and
VOBSUB decoders which already worked correctly with multiple streams.

Test results with multi-language DVB sample:
- Before: Second stream (0xCE0) → "No captions were found"
- After: Second stream (0xCE0) → 5 subtitles extracted correctly

Fixes #1067

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 21:26:56 +01:00
Carlos Fernandez
25162fe40a chore: Add build directories to .gitignore
Add build_*/ pattern and linux/build_scan/ to ignore various build
output directories (build_ocr/, build_ocr_asan/, etc.)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 21:11:51 +01:00
Carlos Fernandez
3365a715a6 fix: Properly handle ATSC CC in private MPEG-2 streams
This commit fixes two issues:

1. ATSC CC data in private MPEG-2 streams (stream type 0x06) was not
   being processed. The code returned CCX_PRIVATE_MPEG2_CC buffer type
   which was never properly implemented - it just dumped debug output
   and returned placeholder bytes.

   Fix: Treat ATSC CC in private MPEG-2 streams the same as in
   user-private streams (0x80-0x8F) by returning CCX_PES buffer type.
   Both contain the same CC data format and should use the same
   processing path.

2. Several dump() calls were using CCX_DMT_GENERIC_NOTICES which is
   enabled by default, causing binary output to flood the terminal
   when processing certain files.

   Fix: Changed to appropriate debug-only masks (CCX_DMT_VERBOSE,
   CCX_DMT_PARSE) so binary dumps only appear when debug mode is
   explicitly enabled.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 21:10:11 +01:00
Carlos Fernandez Sanz
26e0f64720 fix(windows): Configure MSI as 64-bit installer 2025-12-29 20:25:41 +01:00
Carlos Fernandez
a1ed940c8b fix(build): Use -arch x64 flag for WiX build instead of Package attribute
The Platform attribute is not valid in WiX v4+. Instead, specify the
target architecture at build time using the -arch x64 flag.

Changes:
- Remove invalid Platform="x64" attribute from Package element
- Add -arch x64 to wix build command in release workflow
- Keep ProgramFiles64Folder for explicit 64-bit installation path

This ensures the MSI is built as a proper 64-bit package that installs
to "Program Files" instead of "Program Files (x86)".

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 19:03:04 +01:00
ChubbyChipmunk77
f5f4768503 style: fix doc comment formatting for Clippy 2025-12-29 22:01:07 +05:30
Carlos Fernandez
e4374204bd fix(windows): Configure MSI as 64-bit installer
Add Platform="x64" to the WiX Package element and use ProgramFiles64Folder
instead of ProgramFiles6432Folder to ensure the MSI:
- Is recognized as a 64-bit installer by tools like winget/komac
- Installs to "Program Files" instead of "Program Files (x86)"

This fixes winget manifest detection issues where the installer was
incorrectly identified as x86 architecture.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 17:05:51 +01:00
ChubbyChipmunk77
7f55ae5c1d Fixed semantic naming and updated doc comments 2025-12-29 21:30:40 +05:30
Harshdhall01
8bf1bc16de Remove blank line to fix formatting check 2025-12-29 21:14:35 +05:30
Harshdhall01
5352a8b877 Fix formatting: use consistent tab indentation and remove trailing whitespace
- Line 908: Changed spaces+tabs to consistent tabs only
- Line 911: Removed trailing tabs on empty line
2025-12-29 21:05:17 +05:30
Carlos Fernandez Sanz
fd155285d2 0.96.3 2025-12-29 14:56:33 +01:00
Carlos Fernandez
a6fd8d468a chore: Bump version to 0.96.3
Update version number across all files:
- src/lib_ccx/lib_ccx.h (main version define)
- linux/configure.ac, mac/configure.ac (autoconf)
- OpenBSD/Makefile
- package_creators/ (PKGBUILD, ccextractor.spec, debian.sh)
- packaging/winget/ (all yaml manifests)
- packaging/chocolatey/ (nuspec and install script)

Note: Checksums in winget/chocolatey will need to be updated
when the actual release MSI is built.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 14:11:23 +01:00
Carlos Fernandez
5b05ce5073 docs: Add changelog entries for version 0.96.3
Document all changes since 0.96.2 including:
- VOBSUB subtitle extraction for MP4 and MKV files
- Native SCC input file support
- SCC output improvements (frame rate, styled PAC codes)
- Various bug fixes for timing, builds, and OCR

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 13:28:24 +01:00
Carlos Fernandez
d28bc4e114 style: Fix formatting issues in ocr.c and options.rs
- Use tabs for continuation indentation in C code (clang-format)
- Remove extra trailing spaces in Rust code (rustfmt)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 12:39:08 +01:00
Carlos Fernandez Sanz
285e81f9a7 Merge pull request #1898 from hridyasadanand/docs-remove-travis-badge
Good cleanup - removing the outdated Travis CI badge and adding a usage example helps new users. Merging.
2025-12-29 12:23:58 +01:00
Carlos Fernandez Sanz
730156f33b Merge pull request #1914 from VS7686/fix-epg-warnings
Clean fix for unused variable warnings. Verified locally. Merging.
2025-12-29 11:49:37 +01:00
Carlos Fernandez Sanz
152bbd308c Merge pull request #1922 from x15sr71/fix/utf8proc-include-path
Excellent fix! The `__has_include()` approach is clean and removes the symlink workaround.

Verified locally:
- Normal build: ✅
- `-system-libs` build: ✅

Merging.
2025-12-29 11:44:48 +01:00
Carlos Fernandez
8c586bccbd feat(ocr): Add character blacklist and line-split options for better accuracy
Add two new OCR options to improve subtitle recognition:

1. Character blacklist (enabled by default):
   - Blacklists characters |, \, `, _, ~ that are commonly misrecognized
   - Prevents "I" being recognized as "|" (pipe character)
   - Use --no-ocr-blacklist to disable if needed

2. Line-split mode (opt-in via --ocr-line-split):
   - Splits multi-line subtitle images into individual lines
   - Uses PSM 7 (single text line mode) for each line
   - Adds 10px padding around each line for better edge recognition
   - May improve accuracy for some VOBSUB subtitles

Test results with VOBSUB sample:
- Blacklist: Reduces pipe errors from 14 to 0
- Matches subtile-ocr's approach for preventing misrecognition

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 11:33:29 +01:00
Carlos Fernandez Sanz
434cd3959a fix(mp4): Use fixed-width integer types in bswap functions 2025-12-29 11:13:38 +01:00
Harshdhall01
3cb0f61b0c Clean up VCL HRD TODO comment
Replace unclear TODO with explanation of why VCL HRD parameters
are skipped. VCL HRD is for video buffering compliance and not
needed for caption extraction.

Changes:
- Replace TODO comment with clear explanation
- Update mprint message to be more informative
- Remove commented-out exit(1)

Addresses #1894
2025-12-29 15:01:40 +05:30
Chandragupt Singh
a18eaa2c96 fix: utf8proc include path for system library builds 2025-12-29 13:37:39 +05:30
Carlos Fernandez
69b7f9f4c3 fix(mp4): Use fixed-width integer types in bswap functions
Change bswap16 and bswap32 to use int16_t and int32_t instead of
short and long for consistent behavior across platforms.

On Windows x64, `long` is 4 bytes (LLP64 model), while on Linux x64
`long` is 8 bytes (LP64 model). This difference could cause
inconsistent NAL unit length parsing in MP4/MOV files, potentially
affecting timestamp calculations.

This fix ensures the byte-swapping functions work identically on
both platforms by using fixed-width integer types from <stdint.h>.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-29 08:52:33 +01:00
Carlos Fernandez Sanz
63dde6f3b2 feat(mp4): Add VOBSUB subtitle extraction with OCR for MP4 files 2025-12-29 08:47:33 +01:00
Carlos Fernandez
8f64eeb54f ci: Trigger CI tests
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 19:57:11 +01:00
ChubbyChipmunk77
02d91c4a03 REFACTOR: 1. Simplified verify_parity function. 2. Improved documentation for public function validate_cc_pair. 3. Added constant for 0x7F. 2025-12-29 00:00:38 +05:30
Carlos Fernandez
463a4a85a1 build(windows): Add vobsub_decoder to Windows build
Add vobsub_decoder.c and vobsub_decoder.h to the Visual Studio project
and filters files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 18:44:32 +01:00
Carlos Fernandez
ba2833b819 style: Fix clang-format indentation in vobsub_decoder.c
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 17:49:34 +01:00
Carlos Fernandez
635a305c37 build: Add vobsub_decoder to autoconf build system
Add vobsub_decoder.c and vobsub_decoder.h to linux and mac Makefile.am
to fix autoconf build failures.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 17:42:08 +01:00
Carlos Fernandez
6fe612db3e fix: Guard ocr_text access with ENABLE_OCR preprocessor check
The ocr_text field in struct cc_bitmap is only defined when ENABLE_OCR
is set. Wrap the free() calls with #ifdef ENABLE_OCR to fix build
failures in non-OCR configurations.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 17:37:05 +01:00
Carlos Fernandez
2930c61420 feat(mp4): Add VOBSUB subtitle extraction with OCR for MP4 files
Add support for extracting VOBSUB (bitmap) subtitles from MP4 files
and converting them to text formats via OCR. This complements the
existing MKV VOBSUB support added in commit 1fccb783.

Changes:
- Add shared vobsub_decoder module for SPU parsing and OCR
- Add process_vobsub_track() function in mp4.c for subp:MPEG tracks
- Detect and count VOBSUB tracks in MP4 container
- Extract palette from decoder config when available
- Process SPU samples through OCR pipeline

The VOBSUB decoder module provides:
- SPU control sequence parsing (timing, colors, coordinates)
- RLE-encoded bitmap decoding (interlaced format)
- Palette parsing from idx header format
- Integration with Tesseract OCR via ocr_rect()

Tested with sample from issue #1349 - successfully extracted 61
subtitles from 128 SPU samples with accurate OCR text output.

Fixes #1349

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 17:32:24 +01:00
Carlos Fernandez Sanz
173db88dcf feat(matroska): Add VOBSUB subtitle extraction support for MKV files 2025-12-28 14:28:02 +01:00
VS7686
29c3f4e684 Trigger CI re-run 2 2025-12-28 18:04:30 +05:30
VS7686
d4a7b1d6ed Trigger CI re-run 2025-12-28 16:05:22 +05:30
Carlos Fernandez
9d14766b0d fix: Use #define instead of const int for VOBSUB_BLOCK_SIZE
MSVC doesn't support variable-length arrays (VLAs). The const int
declaration wasn't being treated as a compile-time constant,
causing Windows build failure with errors C2057, C2466, C2133.

Changed to #define which is a true compile-time constant.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 11:32:48 +01:00
Carlos Fernandez
6f2a73d706 docs: Add VOBSUB extraction documentation and subtile-ocr Dockerfile
- Add docs/VOBSUB.md explaining the VOBSUB extraction workflow
- Add tools/vobsubocr/Dockerfile for building subtile-ocr OCR tool
- Document how to convert VOBSUB (.idx/.sub) to SRT using OCR

The Dockerfile uses subtile-ocr (https://github.com/gwen-lg/subtile-ocr),
an actively maintained fork of vobsubocr with better accuracy.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 10:26:41 +01:00
Carlos Fernandez
1fccb783f2 feat(matroska): Add VOBSUB subtitle extraction support for MKV files
Previously, CCExtractor would only print "Error: VOBSUB not supported"
when encountering VOBSUB (S_VOBSUB) subtitle tracks in Matroska files.
This left users without any usable output.

This commit adds full VOBSUB extraction support:
- Generate proper .idx index files with timestamps and file positions
- Generate proper .sub files with PS-wrapped SPU data
- Correct PS Pack header with SCR derived from timestamps
- Correct PES header with PTS for each subtitle
- 2048-byte block alignment (standard VOBSUB format)

The output is compatible with VLC, FFmpeg, and other players that
support VobSub subtitle format.

Tested with sample from issue #1371 - output validates correctly
with FFprobe and produces identical subtitle data to mkvextract.

Fixes #1371

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 10:02:19 +01:00
Carlos Fernandez Sanz
ec30a79be9 fix(mp4): Fix 200ms timing offset for MOV/MP4 caption extraction 2025-12-28 09:37:46 +01:00
Carlos Fernandez Sanz
5beb4389f6 fix: Apply --delay option to DVB/bitmap subtitles 2025-12-28 09:36:59 +01:00
Carlos Fernandez
a6ccf29630 fix: Apply --delay option to DVB/bitmap subtitles
The --delay option was not being applied to DVB and other bitmap-based
subtitles (DVD subtitles, etc.), only to CEA-608 subtitles. This made
it impossible for users to correct timing offsets in DVB subtitle
extraction.

Changes:
- Add subs_delay to sub->start_time and sub->end_time for CC_BITMAP
  subtitles in encode_sub(), matching the behavior for CC_608
- Add bounds checking to skip subtitles that become negative after
  applying a negative delay
- Properly free bitmap data when skipping to avoid memory leaks

This provides a workaround for issue #1248 where DVB subtitles were
extracted with incorrect timing offset. Users can now use --delay to
adjust the timing.

Fixes #1248

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-28 07:58:58 +01:00
Carlos Fernandez Sanz
b6d7c7e778 feat(scc): Add configurable frame rate and styled PAC codes for SCC output 2025-12-28 06:54:29 +01:00
Rahul-2k4
117c2fce69 fix(dvb): Apply 3 code review fixes for Issue #447
- Fix escaped newline in debug print (dvb_subtitle_decoder.c:1861)
- Replace hardcoded PID 0x106 with 0 in debug calls (lines 1822, 1835)
- Accept uppercase letters in language code validation (ts_tables.c:396)
2025-12-28 11:06:31 +05:30
Rahul-2k4
ffd6a34c30 Fix Windows CI: change PlatformToolset from v145 to v143 for VS 2022 2025-12-28 10:34:46 +05:30
Rahul-2k4
70af627078 Fix syntax errors in lib_ccx.c: add missing ocr.h include and fix brace structure 2025-12-28 10:32:08 +05:30
Rahul-2k4
b0a5c069ed style: fix clang-format issues for Linux CI compatibility 2025-12-28 10:22:44 +05:30
Rahul-2k4
53ee63894c style: apply clang-format to fix CI formatting check 2025-12-28 10:12:40 +05:30
Rahul-2k4
50ece42e0a style: apply clang-format and normalize line endings to all source files 2025-12-28 00:47:25 +05:30
Rahul-2k4
3d00e718f6 style: normalize line endings and apply clang-format 2025-12-28 00:26:17 +05:30
Carlos Fernandez
021b788461 feat(scc): Add configurable frame rate and styled PAC codes for SCC output
This commit addresses the remaining items from issue #1191:

1. SCC Output Frame Rate:
   - Added scc_framerate to encoder_cfg and encoder_ctx structs
   - The --scc-framerate option now affects both input parsing AND output
   - Supports 24, 25, 29.97 (default), and 30 fps

2. Styled PAC (Preamble Address Code) Optimization:
   - Added support for styled PACs that encode color/font at column 0
   - When captions start at column 0 with non-default style, uses a single
     styled PAC instead of indent PAC + mid-row code
   - More efficient output that matches professional SCC files

Files changed:
- ccx_common_option.h/c: Added scc_framerate to encoder_cfg
- ccx_encoders_common.h/c: Added scc_framerate to encoder_ctx
- ccx_encoders_scc.c: Added get_scc_fps(), styled PAC functions,
  and optimized write_cc_buffer_as_scenarist()
- common.rs: Copy scc_framerate to enc_cfg

Fixes #1191

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 19:45:05 +01:00
Rahul-2k4
86e5d47141 style: apply clang-format to all source files 2025-12-28 00:14:16 +05:30
Rahul-2k4
5b36356456 style: apply clang-format fixes 2025-12-28 00:04:26 +05:30
Rahul-2k4
ba04aedae1 fix: add missing set_pipeline_pts and dump_rect_and_log functions 2025-12-27 23:58:26 +05:30
Rahul-2k4
5001df0d6c fix(rust): add missing lang field to cap_info initializer 2025-12-27 23:56:26 +05:30
Rahul-2k4
28506fee7b Add lang member to struct cap_info for DVB split mode 2025-12-27 23:49:29 +05:30
Rahul-2k4
47d8aaddb9 Merge upstream/master into final: Resolve conflicts in option structs (kept both split_dvb_subs and scc_framerate) 2025-12-27 23:34:40 +05:30
Rahul-2k4
1b2254f911 Fix DVB split output: include core logic handling and memory safety fixes 2025-12-27 23:27:36 +05:30
Rahul-2k4
dc34b26afb Fix DVB split output: handle empty PBUS and missing OCR init (Issue #447) 2025-12-27 23:21:08 +05:30
Carlos Fernandez
c06102678e fix(mp4): Fix 200ms timing offset for MOV/MP4 caption extraction
Set in_bufferdatatype for MP4/MOV container tracks to prevent incorrect
cb_field counter increments that were adding ~200ms to caption timestamps.

Root Cause:
-----------
The in_bufferdatatype variable was never set in mp4.c, remaining as
CCX_UNKNOWN. This caused the check in do_cb() (ccx_decoders_common.c)
to fail:

  if (ctx->in_bufferdatatype != CCX_H264 && ctx->in_bufferdatatype != CCX_PES)
      cb_field1++;

With in_bufferdatatype == CCX_UNKNOWN, cb_field1 was incremented for
each CEA-608 caption block processed. When get_fts() was called to
timestamp captions, it added cb_field1 * 1001/30 ms to the base time.

With ~6 caption blocks per frame (typical for roll-up captions), this
added approximately 200ms (6 × 33.37ms ≈ 200ms) to caption start times.

Analysis:
---------
Sample file: 1974a299f0502fc8199dabcaadb20e422e79df45972e554d58d1d025ef7d0686.mov

Before fix:
- FFmpeg first caption: 13,847ms
- CCExtractor first caption: 14,047ms
- Offset: 200ms late

The timing flow:
1. MP4 sample has PTS=1246245 (13,847ms at 90kHz)
2. set_fts() correctly sets fts_now based on PTS
3. do_cb() processes caption blocks, incrementing cb_field1 each time
4. get_fts() returns: fts_now + fts_global + cb_field1 * 1001/30
5. With cb_field1=6: adds 6 * 33.37 = 200ms offset

The fix ensures cb_field counters are not incremented for container
formats (MP4, MOV, MKV) because these formats associate all caption
data with the frame's PTS directly - there's no sub-frame timing.

Fix:
----
Set in_bufferdatatype in the three MP4 track processing functions:
- process_avc_track(): CCX_H264 for H.264/AVC tracks
- process_hevc_track(): CCX_H264 for H.265/HEVC tracks
- process_xdvb_track(): CCX_PES for MPEG-2 video tracks

After fix:
- FFmpeg first caption: 13,847ms
- CCExtractor first caption: 13,847ms
- Offset: 0ms (exact match)

This fix resolves timing issues for tests 226-230 on the sample platform.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 16:34:05 +01:00
Carlos Fernandez Sanz
b0800a112c feat(input): Add native SCC (Scenarist Closed Caption) input support 2025-12-27 16:16:31 +01:00
Carlos Fernandez
2b0d9ed427 chore: trigger CI rebuild
Timing issues in tests 226-230 are pre-existing and unrelated to SCC support.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 15:37:49 +01:00
Carlos Fernandez
fd4db0e7bf chore: Trigger CI re-run 2025-12-27 11:18:02 +01:00
VS7686
00d8c9cb0a Fix unused variable warnings in ts_tables_epg.c 2025-12-27 14:01:13 +05:30
Carlos Fernandez
7829c14c60 fix: Initialize scc_framerate in init_options()
The scc_framerate field was not being initialized in the C init_options()
function, leaving it with an undefined value. This could cause undefined
behavior when the options struct is used before the Rust code initializes
the field.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 08:38:32 +01:00
Rahul-2k4
d3602ec938 Fix: Defensive handling of invalid caption_field in DVB subtitle timing (fixes #447) 2025-12-27 12:48:28 +05:30
Rahul-2k4
f9b5e081a7 Remove duplicate comment in parser.rs 2025-12-27 11:46:24 +05:30
Rahul-2k4
bdc3eaa81b Fix: update Rust parser to allow text based formats for DVB split 2025-12-27 10:16:36 +05:30
Carlos Fernandez
2820042c1d style: Fix formatting and clippy warnings
- Replace tabs with spaces in doc comments
- Use #[derive(Default)] with #[default] attribute
- Use array syntax for char pattern matching
- Apply clang-format to C files

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 01:19:00 +01:00
Carlos Fernandez
d4d228125a feat(input): Add native SCC (Scenarist Closed Caption) input support
Add native support for reading SCC files directly, eliminating the need
for external conversion tools like SCC2RAW.exe or Perl scripts.

Implementation:
- New Rust parser module (src/rust/src/demuxer/scc.rs) with:
  - SMPTE timecode parsing (HH:MM:SS:FF format)
  - Configurable frame rates: 29.97 (default), 24, 25, 30 fps
  - CEA-608 hex pair extraction
  - UTF-8 BOM handling
  - 12 comprehensive unit tests
- Stream mode detection in both C and Rust code
- FFI exports for C integration (ccxr_is_scc_file, ccxr_process_scc)
- New --scc-framerate command line option
- Integration in raw_loop() following the McPoodle DVD raw pattern

Testing performed:
- Round-trip test: video → SRT, video → SCC, SCC → SRT
  Result: 118/118 captions matched (100% accuracy)
- Multiple output formats verified (SRT, WebVTT, transcript)
- Frame rate option tested with 24fps sample
- UTF-8 BOM handling verified
- All 260 Rust tests pass

Usage:
  ccextractor input.scc -o output.srt
  ccextractor input.scc --scc-framerate 25 -o output.srt

Closes #1293

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-27 00:54:44 +01:00
Rahul-2k4
43d5ba2f34 Improve error message for incompatible OutputFormat in Rust parser 2025-12-27 02:03:51 +05:30
Rahul-2k4
557774b202 Apply code style fixes from clang-format 2025-12-27 01:59:48 +05:30
Rahul-2k4
4e0472bddf Fix DVB split critical bugs: per-pipeline state separation and timing sync 2025-12-27 01:56:12 +05:30
Rahul-2k4
9a2fe6221e Switch platform toolset from v145 to v143 for GitHub Actions compatibility 2025-12-27 01:12:40 +05:30
Rahul Tripathi
182b23a283 Merge branch 'CCExtractor:master' into final 2025-12-27 00:13:39 +05:30
Rahul-2k4
77f3fd35f4 Fix #447: Resolve DVB split mode crash and routing logic
- Fixed NULL pointer dereference in dvb_subtitle_decoder.c (sub->prev check).
- Corrected logic in dvbsub_handle_display_segment to prevent dropped subtitles.
- Implemented robust encoder context swapping in general_loop.c for DVB streams.
- Added regression test: tests/regression/dvb_split.txt.
- Verified 100% completion in split mode and correct Teletext/DVB routing.
2025-12-27 00:11:09 +05:30
Carlos Fernandez Sanz
14e6919f2e ci: Add winget and Chocolatey packaging workflows 2025-12-26 18:20:55 +01:00
Carlos Fernandez
353a37010d ci: Add winget and Chocolatey packaging workflows
Add automated package publishing for Windows package managers:

## Winget
- Initial manifest files for CCExtractor.CCExtractor
- Workflow to auto-submit PRs to microsoft/winget-pkgs on release

## Chocolatey
- Package files (nuspec, install/uninstall scripts)
- Workflow to build and push packages on release

## Setup Required
- WINGET_TOKEN secret (GitHub PAT with public_repo scope)
- CHOCOLATEY_API_KEY secret (from chocolatey.org account)

Closes #1308

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 18:19:11 +01:00
Carlos Fernandez Sanz
921cbe0c57 ci(linux): Add workflow for system-libs builds 2025-12-26 18:08:11 +01:00
VS7686
f0523ceaa3 Fix logic error: removed early returns to restore C implementation 2025-12-26 21:44:12 +05:30
Carlos Fernandez
7284430fc6 fix(build): Preserve FFmpeg libs with -system-libs -hardsubx
The -system-libs mode was overwriting BLD_LINKER and losing the FFmpeg
libraries that -hardsubx adds. This fix preserves the FFmpeg libraries
when both flags are used together.

Also add permissions: contents: write to the workflow to allow
uploading assets to releases.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 15:49:51 +01:00
Carlos Fernandez
68d0d4094e ci(linux): Add workflow for system-libs builds
Add a new GitHub Actions workflow that builds CCExtractor using the
-system-libs flag, creating binaries that dynamically link against
system libraries instead of bundling dependencies.

This is useful for:
- Linux distribution packaging (Debian, Ubuntu, Fedora, etc.)
- Homebrew/Linuxbrew packaging
- Users who prefer smaller binaries with system library updates

Two variants are built:
- basic: Standard OCR-enabled build
- hardsubx: Build with HardSubX (burned-in subtitle extraction)

The workflow runs on releases and can be manually triggered.

Related to #1907

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 15:38:41 +01:00
Carlos Fernandez Sanz
7075f6291d build(linux): Add -system-libs flag for package manager compatibility 2025-12-26 15:32:32 +01:00
Carlos Fernandez Sanz
170d769476 Merge branch 'master' into build/linux-system-libs-flag 2025-12-26 15:31:31 +01:00
Carlos Fernandez
1ff3457744 Updated CHANGES.TXT for 0.96.2 2025-12-26 15:27:02 +01:00
Carlos Fernandez Sanz
dc352a2202 fix(windows): Bundle tessdata for OCR support out of the box 2025-12-26 15:23:34 +01:00
Chandragupt Singh
c8750e42d1 build(linux): use pkg-config cflags for system-libs includes 2025-12-26 18:51:16 +05:30
Carlos Fernandez
20448bfeb2 fix(windows): Bundle tessdata for OCR support out of the box
The Windows release was missing Tesseract OCR runtime dependencies
(tessdata files) needed for the HardSubx feature to work. Users had
to manually install Tesseract OCR and set TESSDATA_PREFIX.

Changes:
- Add get_executable_directory() to ocr.c that returns the directory
  containing the executable (works on Windows, Linux, and macOS)
- Update probe_tessdata_location() to search for tessdata in the
  executable directory, enabling bundled tessdata to be found
- Update release workflow to download eng.traineddata and osd.traineddata
  from tesseract-ocr/tessdata_fast during release builds
- Update WiX installer to include tessdata directory with the
  traineddata files

Now the Windows release includes tessdata files, and CCExtractor will
automatically find them in the installation directory without requiring
users to install Tesseract separately or set environment variables.

Fixes #1578

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-26 13:05:46 +01:00
VS7686
807df0339e Fix styling: Apply clang-format 2025-12-26 16:11:57 +05:30
Rahul-2k4
6642973c63 CLI + option plumbing for --split-dvb-subs 2025-12-26 14:43:36 +05:30
Chandragupt Singh
f08fd658e6 build(linux): Add -system-libs flag for Homebrew compatibility 2025-12-26 13:07:09 +05:30
VS7686
5ae3116a6c Fix indentation: reduce to 4 spaces 2025-12-26 10:29:36 +05:30
VS7686
826afcd991 Fix styling: increase indentation inside ifndef 2025-12-26 10:14:18 +05:30
VS7686
46af5ce9bb Fix coding style and formatting 2025-12-26 09:59:38 +05:30
VS7686
123b35ae69 Fix coding style and formatting 2025-12-26 09:49:17 +05:30
Carlos Fernandez Sanz
f6e9d55838 fix(release): Update Flutter GUI files and add versioned filenames 2025-12-25 22:34:24 +01:00
VS7686
6f7d3f6169 Fix C4098 warnings in networking.c 2025-12-26 00:26:11 +05:30
Carlos Fernandez
07cc78c2f1 feat(release): Add version numbers to release asset filenames 2025-12-25 16:36:18 +01:00
Carlos Fernandez
affa34848c fix(installer): Update Flutter GUI files for v0.7.0 2025-12-25 13:47:57 +01:00
Carlos Fernandez Sanz
45ee03aecc fix(release): Support 3-part version numbers (e.g., v0.96.1) 2025-12-25 12:58:04 +01:00
Carlos Fernandez
c6e27ca809 fix(release): Support 3-part version numbers (e.g., v0.96.1)
Update the version extraction logic in the release workflow to properly
handle 3-part semantic versions like v0.96.1 in addition to existing
2-part versions like v0.96.

MSI installers require 4-part versions (major.minor.build.revision):
- v0.96 → 0.96.0.0 (unchanged behavior)
- v0.96.1 → 0.96.1.0 (new support)
- v0.96.1.2 → 0.96.1.2 (passthrough)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-25 12:56:13 +01:00
Hridya
857a3bc9c6 docs: add basic usage example to documentation 2025-12-25 13:53:15 +05:30
Hridya
c2c589d6f6 docs: remove outdated Travis CI badge from README 2025-12-25 12:44:02 +05:30
GAURAV KARMAKAR
e42bc2b9f9 fixed the merged conflict in the ccx_encoders_common.h 2025-12-24 02:25:53 +05:30
Gaurav karmakar
bf9841a255 Merge branch 'master' into gaurav-v1 2025-12-24 01:55:53 +05:30
GAURAV KARMAKAR
6ed09ea397 SPUPNG: fix formatting to match clang-format 2025-12-22 13:22:25 +05:30
GAURAV KARMAKAR
2b708c4a31 Enhance SPUPNG offset calculations and XML tag handling in EIA608 encoder
- Introduced a forward declaration for [identifier lost in extraction].
- Updated [identifier lost in extraction] to calculate and set image dimensions before writing XML tags.
- Adjusted offset calculations based on screen size for better alignment of subtitles.
- Improved handling of the opening XML tag based on subtitle data presence.
2025-12-21 19:20:28 +05:30
GAURAV KARMAKAR
609a53f373 [BUG] -out=spupng with EIA608/teletext: offset values in XML may be not correct #893 2025-12-19 13:27:08 +05:30
121 changed files with 8335 additions and 1184 deletions

283
.github/workflows/build_deb.yml vendored Normal file
View File

@@ -0,0 +1,283 @@
name: Build Linux .deb Package
on:
# Build on releases
release:
types: [published]
# Allow manual trigger
workflow_dispatch:
inputs:
build_type:
description: 'Build type (all, basic, hardsubx)'
required: false
default: 'all'
# Build on pushes to workflow file for testing
push:
paths:
- '.github/workflows/build_deb.yml'
jobs:
build-deb:
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
build_type: [basic, hardsubx]
steps:
- name: Check if should build this variant
id: should_build
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
INPUT_TYPE="${{ github.event.inputs.build_type }}"
if [ "$INPUT_TYPE" = "all" ] || [ "$INPUT_TYPE" = "${{ matrix.build_type }}" ]; then
echo "should_build=true" >> $GITHUB_OUTPUT
else
echo "should_build=false" >> $GITHUB_OUTPUT
fi
else
echo "should_build=true" >> $GITHUB_OUTPUT
fi
- name: Checkout repository
if: steps.should_build.outputs.should_build == 'true'
uses: actions/checkout@v6
- name: Get version
if: steps.should_build.outputs.should_build == 'true'
id: version
run: |
# Extract version from source or use tag
if [ "${{ github.event_name }}" = "release" ]; then
VERSION="${{ github.event.release.tag_name }}"
VERSION="${VERSION#v}" # Remove 'v' prefix if present
else
# Extract version from lib_ccx.h (e.g., #define VERSION "0.96.5")
VERSION=$(grep -oP '#define VERSION "\K[^"]+' src/lib_ccx/lib_ccx.h || echo "0.96")
fi
echo "version=$VERSION" >> $GITHUB_OUTPUT
echo "Building version: $VERSION"
- name: Install base dependencies
if: steps.should_build.outputs.should_build == 'true'
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
cmake \
pkg-config \
zlib1g-dev \
libpng-dev \
libjpeg-dev \
libfreetype-dev \
libxml2-dev \
libcurl4-gnutls-dev \
libssl-dev \
clang \
libclang-dev \
tesseract-ocr \
libtesseract-dev \
libleptonica-dev \
patchelf
- name: Install FFmpeg dependencies (HardSubX)
if: steps.should_build.outputs.should_build == 'true' && matrix.build_type == 'hardsubx'
run: |
sudo apt-get install -y --no-install-recommends \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
libswresample-dev \
libavfilter-dev \
libavdevice-dev
- name: Install Rust toolchain
if: steps.should_build.outputs.should_build == 'true'
uses: dtolnay/rust-toolchain@stable
- name: Cache GPAC build
if: steps.should_build.outputs.should_build == 'true'
id: cache-gpac
uses: actions/cache@v5
with:
path: ~/gpac-install
key: gpac-abi-16.4-ubuntu24-deb
- name: Build GPAC
if: steps.should_build.outputs.should_build == 'true' && steps.cache-gpac.outputs.cache-hit != 'true'
run: |
git clone -b abi-16.4 --depth 1 https://github.com/gpac/gpac
cd gpac
./configure --prefix=/usr
make -j$(nproc)
make DESTDIR=$HOME/gpac-install install-lib
- name: Install GPAC to system
if: steps.should_build.outputs.should_build == 'true'
run: |
sudo cp -r $HOME/gpac-install/usr/lib/* /usr/lib/
sudo cp -r $HOME/gpac-install/usr/include/* /usr/include/
sudo ldconfig
- name: Build CCExtractor
if: steps.should_build.outputs.should_build == 'true'
run: |
mkdir build && cd build
if [ "${{ matrix.build_type }}" = "hardsubx" ]; then
cmake ../src -DCMAKE_BUILD_TYPE=Release -DWITH_OCR=ON -DWITH_HARDSUBX=ON
else
cmake ../src -DCMAKE_BUILD_TYPE=Release -DWITH_OCR=ON
fi
make -j$(nproc)
- name: Test build
if: steps.should_build.outputs.should_build == 'true'
run: ./build/ccextractor --version
- name: Create .deb package structure
if: steps.should_build.outputs.should_build == 'true'
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
PKG_NAME="ccextractor_${VERSION}_amd64"
else
PKG_NAME="ccextractor-${VARIANT}_${VERSION}_amd64"
fi
mkdir -p ${PKG_NAME}/DEBIAN
mkdir -p ${PKG_NAME}/usr/bin
mkdir -p ${PKG_NAME}/usr/lib/ccextractor
mkdir -p ${PKG_NAME}/usr/share/doc/ccextractor
mkdir -p ${PKG_NAME}/usr/share/man/man1
# Copy binary
cp build/ccextractor ${PKG_NAME}/usr/bin/
# Copy GPAC library
cp $HOME/gpac-install/usr/lib/libgpac.so* ${PKG_NAME}/usr/lib/ccextractor/
# Set rpath so ccextractor finds bundled libgpac
patchelf --set-rpath '/usr/lib/ccextractor:$ORIGIN/../lib/ccextractor' ${PKG_NAME}/usr/bin/ccextractor
# Copy documentation
cp docs/CHANGES.TXT ${PKG_NAME}/usr/share/doc/ccextractor/changelog
cp LICENSE.txt ${PKG_NAME}/usr/share/doc/ccextractor/copyright
gzip -9 -n ${PKG_NAME}/usr/share/doc/ccextractor/changelog
# Generate man page
help2man --no-info --name="closed captions and teletext subtitle extractor" \
./build/ccextractor > ${PKG_NAME}/usr/share/man/man1/ccextractor.1 2>/dev/null || true
if [ -f ${PKG_NAME}/usr/share/man/man1/ccextractor.1 ]; then
gzip -9 -n ${PKG_NAME}/usr/share/man/man1/ccextractor.1
fi
# Create control file
if [ "$VARIANT" = "basic" ]; then
PKG_DESCRIPTION="CCExtractor - closed captions and teletext subtitle extractor"
else
PKG_DESCRIPTION="CCExtractor (with HardSubX) - closed captions and teletext subtitle extractor"
fi
INSTALLED_SIZE=$(du -sk ${PKG_NAME}/usr | cut -f1)
# Determine dependencies based on build variant (Ubuntu 24.04)
if [ "$VARIANT" = "hardsubx" ]; then
DEPENDS="libc6, libtesseract5, liblept5, libcurl3t64-gnutls, libavcodec60, libavformat60, libavutil58, libswscale7, libavdevice60, libswresample4, libavfilter9"
else
DEPENDS="libc6, libtesseract5, liblept5, libcurl3t64-gnutls"
fi
cat > ${PKG_NAME}/DEBIAN/control << CTRL
Package: ccextractor
Version: ${VERSION}
Section: utils
Priority: optional
Architecture: amd64
Installed-Size: ${INSTALLED_SIZE}
Depends: ${DEPENDS}
Maintainer: CCExtractor Development Team <carlos@ccextractor.org>
Homepage: https://www.ccextractor.org
Description: ${PKG_DESCRIPTION}
CCExtractor is a tool that extracts closed captions and teletext subtitles
from video files and streams. It supports a wide variety of input formats
including MPEG, H.264/AVC, H.265/HEVC, MP4, MKV, WTV, and transport streams.
.
This package includes a bundled GPAC library for MP4 support.
CTRL
# Remove leading spaces from control file
sed -i 's/^ //' ${PKG_NAME}/DEBIAN/control
# Create postinst to update library cache
cat > ${PKG_NAME}/DEBIAN/postinst << 'POSTINST'
#!/bin/sh
set -e
ldconfig
POSTINST
chmod 755 ${PKG_NAME}/DEBIAN/postinst
# Create postrm to update library cache
cat > ${PKG_NAME}/DEBIAN/postrm << 'POSTRM'
#!/bin/sh
set -e
ldconfig
POSTRM
chmod 755 ${PKG_NAME}/DEBIAN/postrm
# Set permissions
chmod 755 ${PKG_NAME}/usr/bin/ccextractor
chmod 755 ${PKG_NAME}/usr/lib/ccextractor
find ${PKG_NAME}/usr/lib/ccextractor -name "*.so*" -exec chmod 644 {} \;
# Build the .deb
dpkg-deb --build --root-owner-group ${PKG_NAME}
echo "deb_name=${PKG_NAME}.deb" >> $GITHUB_OUTPUT
- name: Test .deb package
if: steps.should_build.outputs.should_build == 'true'
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
PKG_NAME="ccextractor_${VERSION}_amd64"
else
PKG_NAME="ccextractor-${VARIANT}_${VERSION}_amd64"
fi
# Install and test (apt handles dependencies automatically)
sudo apt-get update
sudo apt-get install -y ./${PKG_NAME}.deb
ccextractor --version
- name: Get .deb filename
if: steps.should_build.outputs.should_build == 'true'
id: deb_name
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
echo "name=ccextractor_${VERSION}_amd64.deb" >> $GITHUB_OUTPUT
else
echo "name=ccextractor-${VARIANT}_${VERSION}_amd64.deb" >> $GITHUB_OUTPUT
fi
- name: Upload .deb artifact
if: steps.should_build.outputs.should_build == 'true'
uses: actions/upload-artifact@v6
with:
name: ${{ steps.deb_name.outputs.name }}
path: ${{ steps.deb_name.outputs.name }}
- name: Upload to Release
if: steps.should_build.outputs.should_build == 'true' && github.event_name == 'release'
uses: softprops/action-gh-release@v2
with:
files: ${{ steps.deb_name.outputs.name }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

275
.github/workflows/build_deb_debian13.yml vendored Normal file
View File

@@ -0,0 +1,275 @@
name: Build Debian 13 .deb Package
on:
# Build on releases
release:
types: [published]
# Allow manual trigger
workflow_dispatch:
inputs:
build_type:
description: 'Build type (all, basic, hardsubx)'
required: false
default: 'all'
# Build on pushes to workflow file for testing
push:
paths:
- '.github/workflows/build_deb_debian13.yml'
jobs:
build-deb:
runs-on: ubuntu-latest
container:
image: debian:trixie
strategy:
fail-fast: false
matrix:
build_type: [basic, hardsubx]
steps:
- name: Check if should build this variant
id: should_build
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
INPUT_TYPE="${{ github.event.inputs.build_type }}"
if [ "$INPUT_TYPE" = "all" ] || [ "$INPUT_TYPE" = "${{ matrix.build_type }}" ]; then
echo "should_build=true" >> $GITHUB_OUTPUT
else
echo "should_build=false" >> $GITHUB_OUTPUT
fi
else
echo "should_build=true" >> $GITHUB_OUTPUT
fi
- name: Install git and dependencies for checkout
if: steps.should_build.outputs.should_build == 'true'
run: |
apt-get update
apt-get install -y git ca-certificates
- name: Checkout repository
if: steps.should_build.outputs.should_build == 'true'
uses: actions/checkout@v6
- name: Get version
if: steps.should_build.outputs.should_build == 'true'
id: version
run: |
# Extract version from source or use tag
if [ "${{ github.event_name }}" = "release" ]; then
VERSION="${{ github.event.release.tag_name }}"
VERSION="${VERSION#v}" # Remove 'v' prefix if present
else
# Extract version from lib_ccx.h (e.g., #define VERSION "0.96.5")
VERSION=$(grep -oP '#define VERSION "\K[^"]+' src/lib_ccx/lib_ccx.h || echo "0.96")
fi
echo "version=$VERSION" >> $GITHUB_OUTPUT
echo "Building version: $VERSION"
- name: Install base dependencies
if: steps.should_build.outputs.should_build == 'true'
run: |
apt-get install -y --no-install-recommends \
build-essential \
cmake \
pkg-config \
zlib1g-dev \
libpng-dev \
libjpeg-dev \
libfreetype-dev \
libxml2-dev \
libcurl4-gnutls-dev \
libssl-dev \
clang \
libclang-dev \
tesseract-ocr \
libtesseract-dev \
libleptonica-dev \
patchelf \
curl
- name: Install FFmpeg dependencies (HardSubX)
if: steps.should_build.outputs.should_build == 'true' && matrix.build_type == 'hardsubx'
run: |
apt-get install -y --no-install-recommends \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
libswresample-dev \
libavfilter-dev \
libavdevice-dev
- name: Install Rust toolchain
if: steps.should_build.outputs.should_build == 'true'
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Build GPAC
if: steps.should_build.outputs.should_build == 'true'
run: |
git clone -b abi-16.4 --depth 1 https://github.com/gpac/gpac
cd gpac
./configure --prefix=/usr
make -j$(nproc)
make install-lib
ldconfig
- name: Build CCExtractor
if: steps.should_build.outputs.should_build == 'true'
run: |
export PATH="$HOME/.cargo/bin:$PATH"
mkdir build && cd build
if [ "${{ matrix.build_type }}" = "hardsubx" ]; then
cmake ../src -DCMAKE_BUILD_TYPE=Release -DWITH_OCR=ON -DWITH_HARDSUBX=ON
else
cmake ../src -DCMAKE_BUILD_TYPE=Release -DWITH_OCR=ON
fi
make -j$(nproc)
- name: Test build
if: steps.should_build.outputs.should_build == 'true'
run: ./build/ccextractor --version
- name: Create .deb package structure
if: steps.should_build.outputs.should_build == 'true'
id: create_deb
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
PKG_NAME="ccextractor_${VERSION}_debian13_amd64"
else
PKG_NAME="ccextractor-${VARIANT}_${VERSION}_debian13_amd64"
fi
mkdir -p ${PKG_NAME}/DEBIAN
mkdir -p ${PKG_NAME}/usr/bin
mkdir -p ${PKG_NAME}/usr/lib/ccextractor
mkdir -p ${PKG_NAME}/usr/share/doc/ccextractor
mkdir -p ${PKG_NAME}/usr/share/man/man1
# Copy binary
cp build/ccextractor ${PKG_NAME}/usr/bin/
# Copy GPAC library
cp /usr/lib/libgpac.so* ${PKG_NAME}/usr/lib/ccextractor/
# Set rpath so ccextractor finds bundled libgpac
patchelf --set-rpath '/usr/lib/ccextractor:$ORIGIN/../lib/ccextractor' ${PKG_NAME}/usr/bin/ccextractor
# Copy documentation
cp docs/CHANGES.TXT ${PKG_NAME}/usr/share/doc/ccextractor/changelog
cp LICENSE.txt ${PKG_NAME}/usr/share/doc/ccextractor/copyright
gzip -9 -n ${PKG_NAME}/usr/share/doc/ccextractor/changelog
# Create control file
if [ "$VARIANT" = "basic" ]; then
PKG_DESCRIPTION="CCExtractor - closed captions and teletext subtitle extractor"
else
PKG_DESCRIPTION="CCExtractor (with HardSubX) - closed captions and teletext subtitle extractor"
fi
INSTALLED_SIZE=$(du -sk ${PKG_NAME}/usr | cut -f1)
# Determine dependencies based on build variant (Debian 13 Trixie)
if [ "$VARIANT" = "hardsubx" ]; then
DEPENDS="libc6, libtesseract5, libleptonica6, libcurl3t64-gnutls, libavcodec61, libavformat61, libavutil59, libswscale8, libavdevice61, libswresample5, libavfilter10"
else
DEPENDS="libc6, libtesseract5, libleptonica6, libcurl3t64-gnutls"
fi
cat > ${PKG_NAME}/DEBIAN/control << CTRL
Package: ccextractor
Version: ${VERSION}
Section: utils
Priority: optional
Architecture: amd64
Installed-Size: ${INSTALLED_SIZE}
Depends: ${DEPENDS}
Maintainer: CCExtractor Development Team <carlos@ccextractor.org>
Homepage: https://www.ccextractor.org
Description: ${PKG_DESCRIPTION}
CCExtractor is a tool that extracts closed captions and teletext subtitles
from video files and streams. It supports a wide variety of input formats
including MPEG, H.264/AVC, H.265/HEVC, MP4, MKV, WTV, and transport streams.
.
This package includes a bundled GPAC library for MP4 support.
Built for Debian 13 (Trixie).
CTRL
# Remove leading spaces from control file
sed -i 's/^ //' ${PKG_NAME}/DEBIAN/control
# Create postinst to update library cache
cat > ${PKG_NAME}/DEBIAN/postinst << 'POSTINST'
#!/bin/sh
set -e
ldconfig
POSTINST
chmod 755 ${PKG_NAME}/DEBIAN/postinst
# Create postrm to update library cache
cat > ${PKG_NAME}/DEBIAN/postrm << 'POSTRM'
#!/bin/sh
set -e
ldconfig
POSTRM
chmod 755 ${PKG_NAME}/DEBIAN/postrm
# Set permissions
chmod 755 ${PKG_NAME}/usr/bin/ccextractor
chmod 755 ${PKG_NAME}/usr/lib/ccextractor
find ${PKG_NAME}/usr/lib/ccextractor -name "*.so*" -exec chmod 644 {} \;
# Build the .deb
dpkg-deb --build --root-owner-group ${PKG_NAME}
echo "deb_name=${PKG_NAME}.deb" >> $GITHUB_OUTPUT
- name: Test .deb package
if: steps.should_build.outputs.should_build == 'true'
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
PKG_NAME="ccextractor_${VERSION}_debian13_amd64"
else
PKG_NAME="ccextractor-${VARIANT}_${VERSION}_debian13_amd64"
fi
# Install and test (apt handles dependencies automatically)
apt-get update
apt-get install -y ./${PKG_NAME}.deb
ccextractor --version
- name: Get .deb filename
if: steps.should_build.outputs.should_build == 'true'
id: deb_name
run: |
VERSION="${{ steps.version.outputs.version }}"
VARIANT="${{ matrix.build_type }}"
if [ "$VARIANT" = "basic" ]; then
echo "name=ccextractor_${VERSION}_debian13_amd64.deb" >> $GITHUB_OUTPUT
else
echo "name=ccextractor-${VARIANT}_${VERSION}_debian13_amd64.deb" >> $GITHUB_OUTPUT
fi
- name: Upload .deb artifact
if: steps.should_build.outputs.should_build == 'true'
uses: actions/upload-artifact@v6
with:
name: ${{ steps.deb_name.outputs.name }}
path: ${{ steps.deb_name.outputs.name }}
- name: Upload to Release
if: steps.should_build.outputs.should_build == 'true' && github.event_name == 'release'
uses: softprops/action-gh-release@v2
with:
files: ${{ steps.deb_name.outputs.name }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -8,6 +8,8 @@ on:
- 'docker/**'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- 'src/rust/**'
pull_request:
types: [opened, synchronize, reopened]
@@ -16,6 +18,8 @@ on:
- 'docker/**'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- 'src/rust/**'
jobs:

View File

@@ -7,6 +7,8 @@ on:
- '.github/workflows/build_linux.yml'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- '**Makefile**'
- 'linux/**'
- 'package_creators/**'
@@ -17,6 +19,8 @@ on:
- '.github/workflows/build_linux.yml'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- '**Makefile**'
- 'linux/**'
- 'package_creators/**'

View File

@@ -0,0 +1,154 @@
name: Build Linux (System Libs)
on:
# Build on releases
release:
types: [published]
# Allow manual trigger
workflow_dispatch:
inputs:
build_type:
description: 'Build type (all, basic, hardsubx)'
required: false
default: 'all'
# Build on pushes to workflow file for testing
push:
paths:
- '.github/workflows/build_linux_systemlibs.yml'
- 'linux/build'
permissions:
contents: write
jobs:
build-systemlibs:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
build_type: [basic, hardsubx]
steps:
- name: Check if should build this variant
id: should_build
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
INPUT_TYPE="${{ github.event.inputs.build_type }}"
if [ "$INPUT_TYPE" = "all" ] || [ "$INPUT_TYPE" = "${{ matrix.build_type }}" ]; then
echo "should_build=true" >> $GITHUB_OUTPUT
else
echo "should_build=false" >> $GITHUB_OUTPUT
fi
else
echo "should_build=true" >> $GITHUB_OUTPUT
fi
- name: Checkout repository
if: steps.should_build.outputs.should_build == 'true'
uses: actions/checkout@v6
- name: Install base dependencies
if: steps.should_build.outputs.should_build == 'true'
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
build-essential \
pkg-config \
zlib1g-dev \
libpng-dev \
libfreetype-dev \
libutf8proc-dev \
libgpac-dev \
libtesseract-dev \
libleptonica-dev \
tesseract-ocr-eng \
clang \
libclang-dev
- name: Install FFmpeg dependencies (HardSubX)
if: steps.should_build.outputs.should_build == 'true' && matrix.build_type == 'hardsubx'
run: |
sudo apt-get install -y --no-install-recommends \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libswscale-dev \
libswresample-dev \
libavfilter-dev \
libavdevice-dev \
libxcb1-dev \
libxcb-shm0-dev \
libx11-dev \
liblzma-dev
- name: Install Rust toolchain
if: steps.should_build.outputs.should_build == 'true'
uses: dtolnay/rust-toolchain@stable
- name: Build with system libraries
if: steps.should_build.outputs.should_build == 'true'
run: |
cd linux
if [ "${{ matrix.build_type }}" = "hardsubx" ]; then
./build -system-libs -hardsubx
else
./build -system-libs
fi
- name: Verify build
if: steps.should_build.outputs.should_build == 'true'
run: |
./linux/ccextractor --version
echo "=== Library dependencies ==="
ldd ./linux/ccextractor | grep -E 'freetype|png|utf8proc|tesseract|leptonica' || true
- name: Get output name
if: steps.should_build.outputs.should_build == 'true'
id: output_name
run: |
case "${{ matrix.build_type }}" in
basic)
echo "name=ccextractor-linux-systemlibs-x86_64" >> $GITHUB_OUTPUT
;;
hardsubx)
echo "name=ccextractor-linux-systemlibs-hardsubx-x86_64" >> $GITHUB_OUTPUT
;;
esac
- name: Package binary
if: steps.should_build.outputs.should_build == 'true'
run: |
mkdir -p package
cp linux/ccextractor package/
# Create a simple README for the package
cat > package/README.txt << 'EOF'
CCExtractor - System Libraries Build
=====================================
This build uses system libraries (dynamic linking).
Required system packages (Debian/Ubuntu):
sudo apt install libgpac12 libtesseract5 libleptonica6 \
libpng16-16 libfreetype6 libutf8proc3
For HardSubX builds, also install:
sudo apt install libavcodec60 libavformat60 libswscale7 libavfilter9
Run with: ./ccextractor --help
EOF
tar -czvf ${{ steps.output_name.outputs.name }}.tar.gz -C package .
- name: Upload artifact
if: steps.should_build.outputs.should_build == 'true'
uses: actions/upload-artifact@v6
with:
name: ${{ steps.output_name.outputs.name }}
path: ${{ steps.output_name.outputs.name }}.tar.gz
- name: Upload to Release
if: steps.should_build.outputs.should_build == 'true' && github.event_name == 'release'
uses: softprops/action-gh-release@v2
with:
files: ${{ steps.output_name.outputs.name }}.tar.gz
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -7,6 +7,8 @@ on:
- '.github/workflows/build_mac.yml'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- '**Makefile**'
- 'mac/**'
- 'package_creators/**'
@@ -17,6 +19,8 @@ on:
- '.github/workflows/build_mac.yml'
- '**.c'
- '**.h'
- '**CMakeLists.txt'
- '**.cmake'
- '**Makefile**'
- 'mac/**'
- 'package_creators/**'

51
.github/workflows/build_snap.yml vendored Normal file
View File

@@ -0,0 +1,51 @@
name: Build CCExtractor Snap
on:
workflow_dispatch:
release:
types: [published]
jobs:
build_snap:
name: Build Snap package
runs-on: ubuntu-22.04
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Install snapd
run: |
sudo apt update
sudo apt install -y snapd
- name: Start snapd
run: |
sudo systemctl start snapd.socket
sudo systemctl start snapd
- name: Install Snapcraft
run: |
sudo snap install core22
sudo snap install snapcraft --classic
- name: Show Snapcraft version
run: snapcraft --version
- name: Build snap
run: sudo snapcraft --destructive-mode
- name: List generated snap
run: ls -lh *.snap
- name: Upload snap as workflow artifact
uses: actions/upload-artifact@v6
with:
name: CCExtractor Snap
path: "*.snap"
- name: Upload snap to GitHub Release
if: github.event_name == 'release'
uses: softprops/action-gh-release@v2
with:
files: "*.snap"

View File

@@ -3,7 +3,6 @@ name: Build CCExtractor on Windows
env:
RUSTFLAGS: -Ctarget-feature=+crt-static
VCPKG_DEFAULT_TRIPLET: x64-windows-static
VCPKG_DEFAULT_BINARY_CACHE: C:\vcpkg\.cache
VCPKG_COMMIT: ab2977be50c702126336e5088f4836060733c899
on:
@@ -13,6 +12,8 @@ on:
- ".github/workflows/build_windows.yml"
- "**.c"
- "**.h"
- "**CMakeLists.txt"
- "**.cmake"
- "windows/**"
- "src/rust/**"
pull_request:
@@ -21,108 +22,118 @@ on:
- ".github/workflows/build_windows.yml"
- "**.c"
- "**.h"
- "**CMakeLists.txt"
- "**.cmake"
- "windows/**"
- "src/rust/**"
jobs:
build_release:
build:
runs-on: windows-2022
steps:
- name: Check out repository
uses: actions/checkout@v6
- name: Setup MSBuild.exe
uses: microsoft/setup-msbuild@v2.0.0
with:
msbuild-architecture: x64
# Install GPAC (fast, ~30s, not worth caching complexity)
- name: Install gpac
run: choco install gpac --version 2.4.0
run: choco install gpac --version 2.4.0 --no-progress
# Use lukka/run-vcpkg for better caching
- name: Setup vcpkg
run: mkdir C:\vcpkg\.cache
- name: Cache vcpkg
id: cache
uses: lukka/run-vcpkg@v11
id: runvcpkg
with:
vcpkgGitCommitId: ${{ env.VCPKG_COMMIT }}
vcpkgDirectory: ${{ github.workspace }}/vcpkg
vcpkgJsonGlob: 'windows/vcpkg.json'
# Cache vcpkg installed packages separately for faster restores
- name: Cache vcpkg installed packages
id: vcpkg-installed-cache
uses: actions/cache@v5
with:
path: ${{ github.workspace }}/vcpkg/installed
key: vcpkg-installed-${{ runner.os }}-${{ env.VCPKG_COMMIT }}-${{ hashFiles('windows/vcpkg.json') }}
restore-keys: |
vcpkg-installed-${{ runner.os }}-${{ env.VCPKG_COMMIT }}-
- name: Install vcpkg dependencies
if: steps.vcpkg-installed-cache.outputs.cache-hit != 'true'
run: ${{ github.workspace }}/vcpkg/vcpkg.exe install --x-install-root ${{ github.workspace }}/vcpkg/installed/
working-directory: windows
# Cache Rust/Cargo artifacts
- name: Cache Cargo registry
uses: actions/cache@v5
with:
path: |
C:\vcpkg\.cache
key: vcpkg-${{ runner.os }}-${{ env.VCPKG_COMMIT }}
- name: Build vcpkg
run: |
git clone https://github.com/microsoft/vcpkg
./vcpkg/bootstrap-vcpkg.bat
- name: Install dependencies
run: ${{ github.workspace }}/vcpkg/vcpkg.exe install --x-install-root ${{ github.workspace }}/vcpkg/installed/
working-directory: windows
- uses: actions-rs/toolchain@v1
~/.cargo/registry
~/.cargo/git
key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
${{ runner.os }}-cargo-registry-
# Cache Cargo build artifacts - rust.bat sets CARGO_TARGET_DIR to windows/
# which results in artifacts at windows/x86_64-pc-windows-msvc/
- name: Cache Cargo build artifacts
uses: actions/cache@v5
with:
toolchain: stable
override: true
path: ${{ github.workspace }}/windows/x86_64-pc-windows-msvc
key: ${{ runner.os }}-cargo-build-${{ hashFiles('**/Cargo.lock') }}-${{ hashFiles('src/rust/**/*.rs') }}
restore-keys: |
${{ runner.os }}-cargo-build-${{ hashFiles('**/Cargo.lock') }}-
${{ runner.os }}-cargo-build-
- name: Setup Rust toolchain
uses: dtolnay/rust-toolchain@stable
- name: Install Win 10 SDK
uses: ilammy/msvc-dev-cmd@v1
- name: build Release-Full
# Build Release-Full
- name: Build Release-Full
env:
LIBCLANG_PATH: "C:\\Program Files\\LLVM\\lib"
LLVM_CONFIG_PATH: "C:\\Program Files\\LLVM\\bin\\llvm-config"
CARGO_TARGET_DIR: "..\\..\\windows"
BINDGEN_EXTRA_CLANG_ARGS: -fmsc-version=0
VCPKG_ROOT: ${{ github.workspace }}/vcpkg
run: msbuild ccextractor.sln /p:Configuration=Release-Full /p:Platform=x64
working-directory: ./windows
- name: Display version information
- name: Display Release version information
run: ./ccextractorwinfull.exe --version
working-directory: ./windows/x64/Release-Full
- uses: actions/upload-artifact@v6
- name: Upload Release artifact
uses: actions/upload-artifact@v6
with:
name: CCExtractor Windows Release build
path: |
./windows/x64/Release-Full/ccextractorwinfull.exe
./windows/x64/Release-Full/*.dll
build_debug:
runs-on: windows-2022
steps:
- name: Check out repository
uses: actions/checkout@v6
- name: Setup MSBuild.exe
uses: microsoft/setup-msbuild@v2.0.0
with:
msbuild-architecture: x64
- name: Install gpac
run: choco install gpac --version 2.4.0
- name: Setup vcpkg
run: mkdir C:\vcpkg\.cache
- name: Cache vcpkg
id: cache
uses: actions/cache@v5
with:
path: |
C:\vcpkg\.cache
key: vcpkg-${{ runner.os }}-${{ env.VCPKG_COMMIT }}
- name: Build vcpkg
run: |
git clone https://github.com/microsoft/vcpkg
./vcpkg/bootstrap-vcpkg.bat
- name: Install dependencies
run: ${{ github.workspace }}/vcpkg/vcpkg.exe install --x-install-root ${{ github.workspace }}/vcpkg/installed/
working-directory: windows
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
- name: Install Win 10 SDK
uses: ilammy/msvc-dev-cmd@v1
- name: build Debug-Full
# Build Debug-Full (reuses cached Cargo artifacts)
- name: Build Debug-Full
env:
LIBCLANG_PATH: "C:\\Program Files\\LLVM\\lib"
LLVM_CONFIG_PATH: "C:\\Program Files\\LLVM\\bin\\llvm-config"
CARGO_TARGET_DIR: "..\\..\\windows"
BINDGEN_EXTRA_CLANG_ARGS: -fmsc-version=0
VCPKG_ROOT: ${{ github.workspace }}/vcpkg
run: msbuild ccextractor.sln /p:Configuration=Debug-Full /p:Platform=x64
working-directory: ./windows
- name: Display version information
- name: Display Debug version information
continue-on-error: true
run: ./ccextractorwinfull.exe --version
working-directory: ./windows/x64/Debug-Full
- uses: actions/upload-artifact@v6
- name: Upload Debug artifact
uses: actions/upload-artifact@v6
with:
name: CCExtractor Windows Debug build
path: |

15
.github/workflows/homebrew.yml vendored Normal file
View File

@@ -0,0 +1,15 @@
name: Bump Homebrew Formula
on:
release:
types: [published]
jobs:
homebrew:
runs-on: ubuntu-latest
steps:
- name: Update Homebrew formula
uses: dawidd6/action-homebrew-bump-formula@v7
with:
token: ${{ secrets.HOMEBREW_GITHUB_API_TOKEN }}
formula: ccextractor

136
.github/workflows/publish_chocolatey.yml vendored Normal file
View File

@@ -0,0 +1,136 @@
# Publish to Chocolatey Community Repository
#
# PREREQUISITES:
# 1. Create a Chocolatey account at https://community.chocolatey.org/account/Register
# 2. Get your API key from https://community.chocolatey.org/account
# 3. Add the API key as repository secret: CHOCOLATEY_API_KEY
#
# Reference: https://docs.chocolatey.org/en-us/create/create-packages-quick-start
name: Publish to Chocolatey
on:
release:
types: [released]
workflow_dispatch:
inputs:
release_tag:
description: 'Release tag to publish (e.g., v0.96.1)'
required: true
type: string
jobs:
publish:
runs-on: windows-latest
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Get version from tag
id: version
shell: bash
run: |
TAG="${{ github.event.inputs.release_tag || github.event.release.tag_name }}"
# Strip 'v' prefix if present
VERSION="${TAG#v}"
echo "version=$VERSION" >> $GITHUB_OUTPUT
echo "tag=$TAG" >> $GITHUB_OUTPUT
- name: Download MSI from release
shell: pwsh
run: |
$version = "${{ steps.version.outputs.version }}"
$tag = "${{ steps.version.outputs.tag }}"
$msiUrl = "https://github.com/CCExtractor/ccextractor/releases/download/$tag/CCExtractor.$version.msi"
Write-Host "Downloading MSI from: $msiUrl"
Invoke-WebRequest -Uri $msiUrl -OutFile "CCExtractor.msi"
# Calculate SHA256 checksum
$hash = (Get-FileHash -Path "CCExtractor.msi" -Algorithm SHA256).Hash
Write-Host "SHA256: $hash"
echo "MSI_CHECKSUM=$hash" >> $env:GITHUB_ENV
- name: Update nuspec version
shell: pwsh
run: |
$version = "${{ steps.version.outputs.version }}"
$nuspecPath = "packaging/chocolatey/ccextractor.nuspec"
$content = Get-Content $nuspecPath -Raw
$content = $content -replace '<version>.*</version>', "<version>$version</version>"
Set-Content -Path $nuspecPath -Value $content
Write-Host "Updated nuspec to version $version"
- name: Update install script
shell: pwsh
run: |
$version = "${{ steps.version.outputs.version }}"
$tag = "${{ steps.version.outputs.tag }}"
$checksum = $env:MSI_CHECKSUM
$installScript = "packaging/chocolatey/tools/chocolateyInstall.ps1"
$content = Get-Content $installScript -Raw
# Update URL
$newUrl = "https://github.com/CCExtractor/ccextractor/releases/download/$tag/CCExtractor.$version.msi"
$content = $content -replace "url64bit\s*=\s*'[^']*'", "url64bit = '$newUrl'"
# Update checksum
$content = $content -replace "checksum64\s*=\s*'[^']*'", "checksum64 = '$checksum'"
Set-Content -Path $installScript -Value $content
Write-Host "Updated install script with URL and checksum"
- name: Build Chocolatey package
shell: pwsh
run: |
cd packaging/chocolatey
choco pack ccextractor.nuspec
# List the generated package
Get-ChildItem *.nupkg
- name: Test package locally
shell: pwsh
run: |
cd packaging/chocolatey
$nupkg = Get-ChildItem *.nupkg | Select-Object -First 1
Write-Host "Testing package: $($nupkg.Name)"
# Install from local package
choco install ccextractor --source="'.;https://community.chocolatey.org/api/v2/'" --yes --force
# Verify installation
$ccx = Get-Command ccextractor -ErrorAction SilentlyContinue
if ($ccx) {
Write-Host "CCExtractor found at: $($ccx.Source)"
& ccextractor --version
} else {
Write-Host "CCExtractor not found in PATH, checking Program Files..."
$exePath = Join-Path $env:ProgramFiles "CCExtractor\ccextractor.exe"
if (Test-Path $exePath) {
& $exePath --version
}
}
- name: Push to Chocolatey
shell: pwsh
env:
CHOCOLATEY_API_KEY: ${{ secrets.CHOCOLATEY_API_KEY }}
run: |
cd packaging/chocolatey
$nupkg = Get-ChildItem *.nupkg | Select-Object -First 1
Write-Host "Pushing $($nupkg.Name) to Chocolatey..."
choco push $nupkg.Name --source="https://push.chocolatey.org/" --api-key="$env:CHOCOLATEY_API_KEY"
Write-Host "Package submitted to Chocolatey! It may take some time to be moderated and published."
- name: Upload package artifact
uses: actions/upload-artifact@v6
with:
name: chocolatey-package
path: packaging/chocolatey/*.nupkg

38
.github/workflows/publish_winget.yml vendored Normal file
View File

@@ -0,0 +1,38 @@
# Publish to Windows Package Manager (winget)
#
# PREREQUISITES:
# 1. CCExtractor must already have ONE version in winget-pkgs before this works
# - Submit the initial manifest manually from packaging/winget/
# - PR to: https://github.com/microsoft/winget-pkgs
#
# 2. Create a fork of microsoft/winget-pkgs under the CCExtractor organization
# - https://github.com/CCExtractor/winget-pkgs (needs to be created)
#
# 3. Create a GitHub Personal Access Token (classic) with 'public_repo' scope
# - Add as repository secret: WINGET_TOKEN
#
# Reference: https://github.com/vedantmgoyal9/winget-releaser
name: Publish to WinGet
on:
release:
types: [released]
workflow_dispatch:
inputs:
release_tag:
description: 'Release tag to publish (e.g., v0.96.1)'
required: true
type: string
jobs:
publish:
runs-on: windows-latest
steps:
- name: Publish to WinGet
uses: vedantmgoyal9/winget-releaser@v2
with:
identifier: CCExtractor.CCExtractor
installers-regex: '\.msi$' # Only use the MSI installer
token: ${{ secrets.WINGET_TOKEN }}
release-tag: ${{ github.event.inputs.release_tag || github.event.release.tag_name }}

View File

@@ -26,7 +26,20 @@ jobs:
# Extract version from tag, strip 'v' prefix and everything after first dash
VERSION=${GITHUB_REF/refs\/tags\/v/}
VERSION=${VERSION%%-*}
echo ::set-output name=VERSION::$VERSION
# Save display version for filenames (e.g., 0.96.1)
echo ::set-output name=DISPLAY_VERSION::$VERSION
# Count dots to determine version format
DOTS="${VERSION//[^.]}"
PART_COUNT=$((${#DOTS} + 1))
# MSI requires 4-part version (major.minor.build.revision)
if [ "$PART_COUNT" -eq 2 ]; then
MSI_VERSION="${VERSION}.0.0"
elif [ "$PART_COUNT" -eq 3 ]; then
MSI_VERSION="${VERSION}.0"
else
MSI_VERSION="${VERSION}"
fi
echo ::set-output name=VERSION::$MSI_VERSION
shell: bash
- name: Setup MSBuild.exe
uses: microsoft/setup-msbuild@v2.0.0
@@ -68,6 +81,14 @@ jobs:
- name: Copy files to directory for installer
run: mkdir installer; cp ./x64/Release-Full/ccextractorwinfull.exe ./installer; cp ./x64/Release-Full/*.dll ./installer
working-directory: ./windows
- name: Download tessdata for OCR support
run: |
mkdir -p ./installer/tessdata
# Download English traineddata from tessdata_fast (smaller, faster, good for most use cases)
Invoke-WebRequest -Uri "https://github.com/tesseract-ocr/tessdata_fast/raw/main/eng.traineddata" -OutFile "./installer/tessdata/eng.traineddata"
# Download OSD (Orientation and Script Detection) for automatic script detection
Invoke-WebRequest -Uri "https://github.com/tesseract-ocr/tessdata_fast/raw/main/osd.traineddata" -OutFile "./installer/tessdata/osd.traineddata"
working-directory: ./windows
- name: install WiX
run: dotnet tool uninstall --global wix; dotnet tool install --global wix --version 6.0.2 && wix extension add -g WixToolset.UI.wixext/6.0.2
- name: Make sure WiX works
@@ -85,15 +106,15 @@ jobs:
run: Get-ChildItem -Recurse ./installer
working-directory: ./windows
- name: Create portable zip
run: Compress-Archive -Path ./installer/* -DestinationPath ./CCExtractor_win_portable.zip
run: Compress-Archive -Path ./installer/* -DestinationPath ./CCExtractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}_win_portable.zip
working-directory: ./windows
- name: Build installer
run: wix build -ext WixToolset.UI.wixext -d "AppVersion=${{ steps.get_version.outputs.VERSION }}.0.0" -o CCExtractor.msi installer.wxs CustomUI.wxs
run: wix build -arch x64 -ext WixToolset.UI.wixext -d "AppVersion=${{ steps.get_version.outputs.VERSION }}" -o CCExtractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}.msi installer.wxs CustomUI.wxs
working-directory: ./windows
- name: Upload as asset
uses: AButler/upload-release-assets@v3.0
with:
files: './windows/CCExtractor.msi;./windows/CCExtractor_win_portable.zip'
files: './windows/CCExtractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}.msi;./windows/CCExtractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}_win_portable.zip'
repo-token: ${{ secrets.GITHUB_TOKEN }}
create_linux_package:
runs-on: ubuntu-latest
@@ -101,10 +122,16 @@ jobs:
- uses: actions/checkout@v6
with:
path: ./ccextractor
- name: Get the version
id: get_version
run: |
VERSION=${GITHUB_REF/refs\/tags\/v/}
VERSION=${VERSION%%-*}
echo ::set-output name=DISPLAY_VERSION::$VERSION
- name: Create .tar.gz without git and windows folders
run: tar -pczf ./ccextractor_minimal.tar.gz --exclude "ccextractor/windows" --exclude "ccextractor/.git" ccextractor
run: tar -pczf ./ccextractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}.tar.gz --exclude "ccextractor/windows" --exclude "ccextractor/.git" ccextractor
- name: Upload as asset
uses: AButler/upload-release-assets@v3.0
with:
files: './ccextractor_minimal.tar.gz'
files: './ccextractor.${{ steps.get_version.outputs.DISPLAY_VERSION }}.tar.gz'
repo-token: ${{ secrets.GITHUB_TOKEN }}

5
.gitignore vendored
View File

@@ -17,6 +17,7 @@ CVS
mac/ccextractor
linux/ccextractor
linux/depend
linux/build_scan/
windows/x86_64-pc-windows-msvc/**
windows/Debug/**
windows/Debug-OCR/**
@@ -28,6 +29,7 @@ windows/Debug-Full/**
windows/x64/**
windows/ccextractor.VC.db
build/
build_*/
####
# Python
@@ -143,6 +145,9 @@ bazel*
#Intellij IDEs
.idea/
# Plans (local only)
plans/
# Rust build and MakeFiles (and CMake files)
src/rust/CMakeFiles/
src/rust/CMakeCache.txt

View File

@@ -4,7 +4,7 @@ MAINTAINER = Marc Espie <espie@openbsd.org>
CATEGORIES = multimedia
COMMENT = closed caption subtitles extractor
HOMEPAGE = https://ccextractor.org
V = 0.96
V = 0.96.5
DISTFILES = ccextractor.${V:S/.//}-src.zip
MASTER_SITES = ${MASTER_SITE_SOURCEFORGE:=ccextractor/}
DISTNAME = ccextractor-$V

View File

@@ -2,7 +2,6 @@
# CCExtractor
<a href="https://travis-ci.org/CCExtractor/ccextractor"><img src="https://raw.githubusercontent.com/CCExtractor/ccextractor-org-media/master/static/macOS-build-badge-logo.png" width="20"></a> [![Build Status](https://travis-ci.org/CCExtractor/ccextractor.svg?branch=master)](https://travis-ci.org/CCExtractor/ccextractor)
[![Sample-Platform Build Status Windows](https://sampleplatform.ccextractor.org/static/img/status/build-windows.svg?maxAge=1800)](https://sampleplatform.ccextractor.org/test/master/windows)
[![Sample-Platform Build Status Linux](https://sampleplatform.ccextractor.org/static/img/status/build-linux.svg?maxAge=1800)](https://sampleplatform.ccextractor.org/test/master/linux)
[![SourceForge](https://img.shields.io/badge/SourceForge%20downloads-213k%2Ftotal-brightgreen.svg)](https://sourceforge.net/projects/ccextractor/)
@@ -29,6 +28,25 @@ The core functionality is written in C. Other languages used include C++ and Pyt
Downloads for precompiled binaries and source code can be found [on our website](https://ccextractor.org/public/general/downloads/).
### Windows Package Managers
**WinGet:**
```powershell
winget install CCExtractor.CCExtractor
```
**Chocolatey:**
```powershell
choco install ccextractor
```
**Scoop:**
```powershell
scoop bucket add extras
scoop install ccextractor
```
Extracting subtitles is relatively simple. Just run the following command:
`ccextractor <input>`
@@ -44,6 +62,34 @@ You can also find the list of parameters and their brief description by running
You can find sample files on [our website](https://ccextractor.org/public/general/tvsamples/) to test the software.
### Building from Source
- [Building on Windows using WSL](docs/build-wsl.md)
#### Linux (Autotools) build notes
CCExtractor also supports an autotools-based build system under the `linux/`
directory.
Important notes:
- The autotools workflow lives inside `linux/`. The `configure` script is
generated there and should be run from that directory.
- Typical build steps are:
```
cd linux
./autogen.sh
./configure
make
```
- Rust support is enabled automatically if `cargo` and `rustc` are available
on the system. In that case, Rust components are built and linked during
`make`.
- If you encounter unexpected build or linking issues, a clean rebuild
(`make clean` or a fresh clone) is recommended, especially when Rust is
involved.
This build flow has been tested on Linux and WSL.
## Compiling CCExtractor
To learn more about how to compile and build CCExtractor for your platform check the [compilation guide](https://github.com/CCExtractor/ccextractor/blob/master/docs/COMPILATION.MD).

View File

@@ -1,9 +1,68 @@
0.96.6 (unreleased)
-------------------
- New: Add Snap packaging support with Snapcraft configuration and GitHub Actions CI workflow.
- Fix: Clear status line output on Linux/WSL to prevent text artifacts (#2017)
- Fix: Prevent infinite loop on truncated MKV files
- Fix: Various memory safety and stability fixes in demuxers (MP4, PS, MKV, DVB)
- Fix: Delete empty output files instead of leaving 0-byte files (#1282)
- Fix: --mkvlang now supports BCP 47 language tags (e.g., en-US, zh-Hans-CN) and multiple codes
0.96.5 (2026-01-05)
-------------------
- New: CCExtractor is available again via Homebrew on macOS and Linux.
- New: Add support for raw CDP (Caption Distribution Packet) files (#1406)
- New: Add --scc-accurate-timing option for bandwidth-aware SCC output (#1120)
- Fix: MXF files containing CEA-708 captions not being detected/extracted (#1647)
- Docs: Add Windows WSL build instructions
- Fix: Security fixes (out-of-bounds read/write) in a few places in the legacy C code.
0.96.4 (2026-01-01)
-------------------
- New: Persistent CEA-708 decoder context - maintains state across multiple calls for proper subtitle continuity
- New: OCR character blacklist options (--ocr-blacklist, --ocr-blacklist-file) for improved accuracy
- New: OCR line-split option (--ocr-splitontimechange) for better subtitle segmentation
- Fix: 32-bit build failures on i686 and armv7l architectures
- Fix: Legacy command-line argument compatibility (-1, -2, -12, --sc, --svc)
- Fix: Prevent heap buffer overflow in Teletext processing (security fix)
- Fix: Prevent integer overflow leading to heap buffer overflow in Transport Stream handling (security fix)
- Fix: Lazy OCR initialization - only initialize when first DVB subtitle is encountered
- Build: Optimized Windows CI workflow for faster builds
- Fix: Updated GUI to version 0.7.1. A blind attempt to fix a hang on startup on some Windows systems.
0.96.3 (2025-12-29)
-------------------
- New: VOBSUB subtitle extraction with OCR support for MP4 files
- New: VOBSUB subtitle extraction support for MKV/Matroska files
- New: Native SCC (Scenarist Closed Caption) input file support - CCExtractor can now read SCC files
- New: Configurable frame rate (--scc-framerate) and styled PAC codes for SCC output
- Fix: Apply --delay option to DVB/bitmap subtitles (previously only worked with text-based subtitles)
- Fix: 200ms timing offset in MOV/MP4 caption extraction
- Fix: utf8proc include path for system library builds
- Fix: Use fixed-width integer types in MP4 bswap functions for better portability
- Fix: Guard ocr_text access with ENABLE_OCR preprocessor check
- Fix: Preserve FFmpeg libs when building with -system-libs -hardsubx
- Build: Add vobsub_decoder to Windows and autoconf build systems
- Build: Add winget and Chocolatey packaging workflows for Windows distribution
- Docs: Add VOBSUB extraction documentation and subtile-ocr Dockerfile
0.96.2 (2025-12-26)
-------------------
- Fix: Resolve utf8proc header include path when building against system libraries on Linux.
- Rebundle Windows version to include required runtime files to process hardcoded subtitles
(hardsubx mode).
- New: Add optional -system-libs flag to Linux build script for package manager compatibility
0.96.1 (2025-12-25)
-------------------
- Rebundle Windows version to include an updated GUI. No changes in CCExtractor itself.
0.96 (2025-12-23)
-----------------
- New: Multi-page teletext extraction support (#665)
- Extract multiple teletext pages simultaneously with separate output files
- Use --tpage multiple times (e.g., --tpage 100 --tpage 200)
- Output files are named with page suffix (e.g., output_p100.srt, output_p200.srt)
- Fix: SPUPNG subtitle offset calculation to center based on actual image dimensions
- New: Added --list-tracks (-L) option to list all tracks in media files without processing
New: Chinese, Korean, Japanese support - proper encoding and OCR.

View File

@@ -1,3 +1,16 @@
# Installation
## Homebrew
The easiest way to install CCExtractor for Mac and Linux is through Homebrew:
```bash
brew install ccextractor
```
Note: If you don't have Homebrew installed, see [brew.sh](https://brew.sh/)
for installation instructions.
---
# Compiling CCExtractor
You can compile CCExtractor on all major platforms using the `CMakeLists.txt` stored under the `ccextractor/src/` directory. Autoconf and custom build scripts are also available. See the platform-specific instructions in the sections below.

View File

@@ -26,6 +26,14 @@ Running ccextractor without parameters shows the help screen. Usage is
trivial - you just need to pass the input file and (optionally) some
details about the input and output files.
Example:
ccextractor input_video.ts
This command extracts subtitles from the input video file and generates a subtitle output file
(such as .srt) in the same directory.
## Languages
Usually English captions are transmitted in line 21 field 1 data,

129
docs/VOBSUB.md Normal file
View File

@@ -0,0 +1,129 @@
# VOBSUB Subtitle Extraction from MKV Files
CCExtractor supports extracting VOBSUB (S_VOBSUB) subtitles from Matroska (MKV) containers. VOBSUB is an image-based subtitle format originally from DVD video.
## Overview
VOBSUB subtitles consist of two files:
- `.idx` - Index file containing metadata, palette, and timestamp/position entries
- `.sub` - Binary file containing the actual subtitle bitmap data in MPEG Program Stream format
## Basic Usage
```bash
ccextractor movie.mkv
```
This will extract all VOBSUB tracks and create paired `.idx` and `.sub` files:
- `movie_eng.idx` + `movie_eng.sub` (first English track)
- `movie_eng_1.idx` + `movie_eng_1.sub` (second English track, if present)
- etc.
## Converting VOBSUB to SRT (Text)
Since VOBSUB subtitles are images, you need OCR (Optical Character Recognition) to convert them to text-based formats like SRT.
### Using subtile-ocr (Recommended)
[subtile-ocr](https://github.com/gwen-lg/subtile-ocr) is an actively maintained Rust tool that provides accurate OCR conversion.
#### Option 1: Docker (Easiest)
We provide a Dockerfile that builds subtile-ocr with all dependencies:
```bash
# Build the Docker image (one-time)
cd tools/vobsubocr
docker build -t subtile-ocr .
# Extract VOBSUB from MKV
ccextractor movie.mkv
# Convert to SRT using OCR
docker run --rm -v $(pwd):/data subtile-ocr -l eng -o /data/movie_eng.srt /data/movie_eng.idx
```
#### Option 2: Install subtile-ocr Natively
If you have Rust and Tesseract development libraries installed:
```bash
# Install dependencies (Ubuntu/Debian)
sudo apt-get install libleptonica-dev libtesseract-dev tesseract-ocr tesseract-ocr-eng
# Install subtile-ocr
cargo install --git https://github.com/gwen-lg/subtile-ocr
# Convert
subtile-ocr -l eng -o movie_eng.srt movie_eng.idx
```
### subtile-ocr Options
| Option | Description |
|--------|-------------|
| `-l, --lang <LANG>` | Tesseract language code (required). Examples: `eng`, `fra`, `deu`, `chi_sim` |
| `-o, --output <FILE>` | Output SRT file (stdout if not specified) |
| `-t, --threshold <0.0-1.0>` | Binarization threshold (default: 0.6) |
| `-d, --dpi <DPI>` | Image DPI for OCR (default: 150) |
| `--dump` | Save processed subtitle images as PNG files |
### Language Codes
Install additional Tesseract language packs as needed:
```bash
# Examples
sudo apt-get install tesseract-ocr-fra # French
sudo apt-get install tesseract-ocr-deu # German
sudo apt-get install tesseract-ocr-spa # Spanish
sudo apt-get install tesseract-ocr-chi-sim # Simplified Chinese
```
## Technical Details
### .idx File Format
The index file contains:
1. Header with metadata (size, palette, alignment settings)
2. Language identifier line
3. Timestamp entries with file positions
Example:
```
# VobSub index file, v7 (do not modify this line!)
size: 720x576
palette: 000000, 828282, ...
id: eng, index: 0
timestamp: 00:01:12:920, filepos: 000000000
timestamp: 00:01:18:640, filepos: 000000800
...
```
### .sub File Format
The binary file contains MPEG Program Stream packets:
- Each subtitle is wrapped in a PS Pack header (14 bytes) + PES header (15 bytes)
- Subtitles are aligned to 2048-byte boundaries
- Contains raw SPU (SubPicture Unit) bitmap data
## Troubleshooting
### Empty output files
- Ensure the MKV file actually contains VOBSUB tracks (check with `mediainfo` or `ffprobe`)
- CCExtractor will report "No VOBSUB subtitles to write" if the track is empty
### OCR quality issues
- Try adjusting the `-t` threshold parameter
- Ensure the correct language pack is installed
- Use `--dump` to inspect the processed images
### Docker permission issues
- The output files may be owned by root; use `sudo chown` to fix ownership
- Or run Docker with `--user $(id -u):$(id -g)`
## See Also
- [OCR.md](OCR.md) - General OCR support in CCExtractor
- [subtile-ocr GitHub](https://github.com/gwen-lg/subtile-ocr) - OCR tool documentation

137
docs/build-wsl.md Normal file
View File

@@ -0,0 +1,137 @@
# Building CCExtractor on Windows using WSL
This guide explains how to build CCExtractor on Windows using WSL (Ubuntu).
It is based on a fresh setup and covers all required dependencies as well as
the common build issues encountered during compilation.
---
## Prerequisites
- Windows 10 or Windows 11
- WSL enabled
- Ubuntu installed via Microsoft Store
---
## Install WSL and Ubuntu
From PowerShell (run as Administrator):
```powershell
wsl --install -d Ubuntu
```
Restart the system if prompted, then launch Ubuntu from the Start menu.
---
## Update system packages
```bash
sudo apt update
```
---
## Install basic build tools
```bash
sudo apt install -y build-essential git pkg-config
```
---
## Install Rust (required)
CCExtractor includes Rust components, so Rust and Cargo are required.
```bash
curl https://sh.rustup.rs -sSf | sh
source ~/.cargo/env
```
Verify installation:
```bash
cargo --version
rustc --version
```
---
## Install required libraries
```bash
sudo apt install -y \
libclang-dev clang \
libtesseract-dev tesseract-ocr \
libgpac-dev
```
---
## Clone the repository
```bash
git clone https://github.com/CCExtractor/ccextractor.git
cd ccextractor
```
---
## Build CCExtractor
```bash
cd linux
./build
```
After a successful build, verify by running:
```bash
./ccextractor
```
You should see the help/usage output.
---
## Common build issues
### cargo: command not found
```bash
source ~/.cargo/env
```
---
### Unable to find libclang
```bash
sudo apt install libclang-dev clang
```
---
### gpac/isomedia.h: No such file or directory
```bash
sudo apt install libgpac-dev
```
---
### please install tesseract development library
```bash
sudo apt install libtesseract-dev tesseract-ocr
```
---
## Notes
- Compiler warnings during the build process are expected and do not indicate failure.
- This guide was tested on Ubuntu (WSL) running on Windows 11.

View File

@@ -151,6 +151,8 @@ ccextractor_SOURCES = \
../src/lib_ccx/list.h \
../src/lib_ccx/matroska.c \
../src/lib_ccx/matroska.h \
../src/lib_ccx/vobsub_decoder.c \
../src/lib_ccx/vobsub_decoder.h \
../src/lib_ccx/mp4.c \
../src/lib_ccx/myth.c \
../src/lib_ccx/networking.c \

View File

@@ -2,6 +2,7 @@
RUST_LIB="rust/release/libccx_rust.a"
RUST_PROFILE="--release"
USE_SYSTEM_LIBS=false
while [[ $# -gt 0 ]]; do
case $1 in
-debug)
@@ -23,6 +24,10 @@ while [[ $# -gt 0 ]]; do
BLD_LINKER="$BLD_LINKER -lswscale -lavutil -pthread -lavformat -lavcodec -lavfilter -lxcb-shm -lxcb -lX11 -llzma -lswresample"
shift
;;
-system-libs)
USE_SYSTEM_LIBS=true
shift
;;
-*)
echo "Unknown option $1"
exit 1
@@ -30,7 +35,42 @@ while [[ $# -gt 0 ]]; do
esac
done
BLD_FLAGS="$BLD_FLAGS -std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DFT2_BUILD_LIBRARY -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP"
if [ "$USE_SYSTEM_LIBS" = true ]; then
command -v pkg-config >/dev/null || {
echo "Error: pkg-config is required for -system-libs mode"
exit 1
}
MISSING=""
for lib in libpng zlib freetype2 libutf8proc; do
if ! pkg-config --exists "$lib" 2>/dev/null; then
MISSING="$MISSING $lib"
fi
done
if [ -n "$MISSING" ]; then
echo "Error: Missing required system libraries:$MISSING"
echo ""
echo "On Debian/Ubuntu: sudo apt install libpng-dev zlib1g-dev libfreetype-dev libutf8proc-dev"
exit 1
fi
for hdr in leptonica/allheaders.h tesseract/capi.h; do
if ! echo "#include <$hdr>" | gcc -E - >/dev/null 2>&1; then
echo "Error: Missing headers for <$hdr>"
echo "On Debian/Ubuntu: sudo apt install libleptonica-dev libtesseract-dev"
exit 1
fi
done
PKG_CFLAGS="$(pkg-config --cflags libpng zlib freetype2 libutf8proc)"
PKG_LIBS="$(pkg-config --libs libpng zlib freetype2 libutf8proc)"
fi
BLD_FLAGS="$BLD_FLAGS -std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -DENABLE_OCR -DGPAC_DISABLE_VTT -DGPAC_DISABLE_OD_DUMP -DGPAC_DISABLE_REMOTERY -DNO_GZIP"
if [ "$USE_SYSTEM_LIBS" != true ]; then
BLD_FLAGS="$BLD_FLAGS -DFT2_BUILD_LIBRARY"
fi
bit_os=$(getconf LONG_BIT)
if [ "$bit_os" == "64" ]
then
@@ -87,6 +127,24 @@ SRC_FREETYPE="../src/thirdparty/freetype/autofit/autofit.c
BLD_SOURCES="../src/ccextractor.c $SRC_CCX $SRC_GPAC $SRC_ZLIB $SRC_LIBPNG $SRC_HASH $SRC_UTF8PROC $SRC_FREETYPE"
BLD_LINKER="$BLD_LINKER -lm -zmuldefs -l tesseract -l leptonica -lpthread -ldl -lgpac"
if [ "$USE_SYSTEM_LIBS" = true ]; then
LEPTONICA_CFLAGS="$(pkg-config --cflags --silence-errors lept)"
TESSERACT_CFLAGS="$(pkg-config --cflags --silence-errors tesseract)"
GPAC_CFLAGS="$(pkg-config --cflags --silence-errors gpac)"
BLD_INCLUDE="-I../src -I../src/lib_ccx -I../src/lib_ccx/zvbi -I../src/thirdparty/lib_hash \
$PKG_CFLAGS $LEPTONICA_CFLAGS $TESSERACT_CFLAGS $GPAC_CFLAGS"
BLD_SOURCES="../src/ccextractor.c $SRC_CCX $SRC_HASH"
# Preserve FFmpeg libraries if -hardsubx was specified
FFMPEG_LIBS=""
if [ "$HARDSUBX" = true ]; then
FFMPEG_LIBS="-lswscale -lavutil -pthread -lavformat -lavcodec -lavfilter -lxcb-shm -lxcb -lX11 -llzma -lswresample"
fi
BLD_LINKER="$PKG_LIBS -ltesseract -lleptonica -lgpac -lpthread -ldl -lm $FFMPEG_LIBS"
fi
echo "Running pre-build script..."
./pre-build.sh
echo "Trying to compile..."
@@ -149,3 +207,7 @@ if [[ "$out" != "" ]] ; then
else
echo "Compilation successful, no compiler messages."
fi
if [ -d ./utf8proc_compat ]; then
rm -rf ./utf8proc_compat
fi

View File

@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.71])
AC_INIT([CCExtractor], [0.96], [carlos@ccextractor.org])
AC_INIT([CCExtractor], [0.96.5], [carlos@ccextractor.org])
AC_CONFIG_AUX_DIR([build-conf])
AC_CONFIG_SRCDIR([../src/ccextractor.c])
AM_INIT_AUTOMAKE([foreign subdir-objects])

View File

@@ -123,6 +123,8 @@ ccextractor_SOURCES = \
../src/lib_ccx/list.h \
../src/lib_ccx/matroska.c \
../src/lib_ccx/matroska.h \
../src/lib_ccx/vobsub_decoder.c \
../src/lib_ccx/vobsub_decoder.h \
../src/lib_ccx/mp4.c \
../src/lib_ccx/myth.c \
../src/lib_ccx/networking.c \

View File

@@ -42,7 +42,16 @@ while [[ $# -gt 0 ]]; do
esac
done
BLD_FLAGS="-std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -Dfopen64=fopen -Dopen64=open -Dlseek64=lseek"
# Determine architecture based on cargo (to ensure consistency with Rust part)
CARGO_ARCH=$(file $(which cargo) | grep -o 'x86_64\|arm64')
if [[ "$CARGO_ARCH" == "x86_64" ]]; then
echo "Detected Intel (x86_64) Cargo. Forcing x86_64 build to match Rust and libraries..."
BLD_ARCH="-arch x86_64"
else
BLD_ARCH="-arch arm64"
fi
BLD_FLAGS="$BLD_ARCH -std=gnu99 -Wno-write-strings -Wno-pointer-sign -D_FILE_OFFSET_BITS=64 -DVERSION_FILE_PRESENT -Dfopen64=fopen -Dopen64=open -Dlseek64=lseek"
# Add flags for bundled libraries (not needed when using system libs)
if [[ "$USE_SYSTEM_LIBS" != "true" ]]; then

View File

@@ -2,7 +2,7 @@
# Process this file with autoconf to produce a configure script.
AC_PREREQ([2.71])
AC_INIT([CCExtractor],[0.96],[carlos@ccextractor.org])
AC_INIT([CCExtractor],[0.96.5],[carlos@ccextractor.org])
AC_CONFIG_AUX_DIR([build-conf])
AC_CONFIG_SRCDIR([../src/ccextractor.c])
AM_INIT_AUTOMAKE([foreign subdir-objects])

View File

@@ -1,5 +1,5 @@
pkgname=ccextractor
pkgver=0.96
pkgver=0.96.5
pkgrel=1
pkgdesc="A closed captions and teletext subtitles extractor for video streams."
arch=('i686' 'x86_64')

View File

@@ -1,5 +1,5 @@
Name: ccextractor
Version: 0.96
Version: 0.96.5
Release: 1
Summary: A closed captions and teletext subtitles extractor for video streams.
Group: Applications/Internet

View File

@@ -1,7 +1,7 @@
#!/bin/bash
TYPE="debian" # can be one of 'slackware', 'debian', 'rpm'
PROGRAM_NAME="ccextractor"
VERSION="0.96"
VERSION="0.96.5"
RELEASE="1"
LICENSE="GPL-2.0"
MAINTAINER="carlos@ccextractor.org"

96
packaging/README.md Normal file
View File

@@ -0,0 +1,96 @@
# CCExtractor Packaging
This directory contains packaging configurations for Windows package managers.
## Windows Package Manager (winget)
### Initial Setup (One-time)
1. **Calculate MSI hash** for the current release:
```powershell
certutil -hashfile CCExtractor.0.96.1.msi SHA256
```
2. **Update the manifest files** in `winget/` with the SHA256 hash
3. **Fork microsoft/winget-pkgs** to the CCExtractor organization:
- Go to https://github.com/microsoft/winget-pkgs
- Fork to https://github.com/CCExtractor/winget-pkgs
4. **Submit initial manifest** via PR:
- Clone your fork
- Create directory: `manifests/c/CCExtractor/CCExtractor/0.96.1/`
- Copy the three YAML files from `winget/`
- Submit PR to microsoft/winget-pkgs
5. **Create GitHub token** for automation:
- Go to GitHub Settings > Developer settings > Personal access tokens > Tokens (classic)
- Create token with `public_repo` scope
- Add as secret `WINGET_TOKEN` in CCExtractor/ccextractor repository
### Automated Updates
After the initial submission is merged, the `publish_winget.yml` workflow will automatically submit PRs for new releases.
## Chocolatey
### Initial Setup (One-time)
1. **Create Chocolatey account**:
- Register at https://community.chocolatey.org/account/Register
2. **Get API key**:
- Go to https://community.chocolatey.org/account
- Copy your API key
3. **Add secret**:
- Add `CHOCOLATEY_API_KEY` secret to CCExtractor/ccextractor repository
### Package Structure
```
chocolatey/
├── ccextractor.nuspec # Package metadata
└── tools/
├── chocolateyInstall.ps1 # Installation script
└── chocolateyUninstall.ps1 # Uninstallation script
```
### Manual Testing
```powershell
cd packaging/chocolatey
# Update version and checksum in files first, then:
choco pack ccextractor.nuspec
# Test locally
choco install ccextractor --source="'.'" --yes --force
# Verify
ccextractor --version
```
### Automated Updates
The `publish_chocolatey.yml` workflow automatically:
1. Downloads the MSI from the release
2. Calculates the SHA256 checksum
3. Updates the nuspec and install script
4. Builds and tests the package
5. Pushes to Chocolatey
Note: Chocolatey packages go through moderation before being publicly available.
## Workflow Triggers
Both workflows trigger on:
- **Release published**: Automatic publishing when a new stable release is published (the workflows listen for the `released` event type, so pre-releases do not trigger them)
- **Manual dispatch**: Can be triggered manually with a specific tag
## Secrets Required
| Secret | Purpose |
|--------|---------|
| `WINGET_TOKEN` | GitHub PAT with `public_repo` scope for winget PRs |
| `CHOCOLATEY_API_KEY` | Chocolatey API key for package uploads |

View File

@@ -0,0 +1,43 @@
<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://schemas.microsoft.com/packaging/2015/06/nuspec.xsd">
<metadata>
<id>ccextractor</id>
<version>0.96.5</version>
<title>CCExtractor</title>
<authors>CCExtractor Development Team</authors>
<owners>CCExtractor</owners>
<licenseUrl>https://github.com/CCExtractor/ccextractor/blob/master/LICENSE.txt</licenseUrl>
<projectUrl>https://ccextractor.org</projectUrl>
<iconUrl>https://raw.githubusercontent.com/CCExtractor/ccextractor/master/windows/CCX.ico</iconUrl>
<requireLicenseAcceptance>false</requireLicenseAcceptance>
<description>CCExtractor is a tool that analyzes video files and produces independent subtitle files from the closed captions data.
### Features
- Extracts closed captions from various video formats (MPEG, H.264, MKV, MP4, etc.)
- Supports multiple input sources including DVDs, DVRs, and live TV captures
- Outputs to multiple formats (SRT, WebVTT, SAMI, transcript, etc.)
- OCR support for bitmap-based subtitles (DVB, teletext)
- Includes a graphical user interface
### Usage
After installation, run `ccextractor` from the command line or use the GUI.
```
ccextractor video.ts -o output.srt
```
For more options: `ccextractor --help`
</description>
<summary>Extract closed captions and subtitles from video files</summary>
<releaseNotes>https://github.com/CCExtractor/ccextractor/releases</releaseNotes>
<copyright>Copyright (c) CCExtractor Development</copyright>
<tags>subtitles closed-captions video extraction accessibility srt dvb teletext ocr media cli</tags>
<projectSourceUrl>https://github.com/CCExtractor/ccextractor</projectSourceUrl>
<packageSourceUrl>https://github.com/CCExtractor/ccextractor/tree/master/packaging/chocolatey</packageSourceUrl>
<docsUrl>https://github.com/CCExtractor/ccextractor/wiki</docsUrl>
<bugTrackerUrl>https://github.com/CCExtractor/ccextractor/issues</bugTrackerUrl>
</metadata>
<files>
<file src="tools\**" target="tools" />
</files>
</package>

View File

@@ -0,0 +1,24 @@
# Chocolatey install script for CCExtractor.
#
# Downloads the 64-bit release MSI, verifies it against the pinned SHA256
# checksum, installs it silently, and then adds the install directory to the
# machine-wide PATH so `ccextractor` can be run from any shell.
$ErrorActionPreference = 'Stop'
$packageName = 'ccextractor'

# Package parameters.
# NOTE: keep the `key = 'value'` layout of url64bit/checksum64 stable; the
# packaging docs state that the publish workflow rewrites this script on release.
$packageArgs = @{
    packageName = $packageName
    fileType = 'MSI'
    url64bit = 'https://github.com/CCExtractor/ccextractor/releases/download/v0.96.5/CCExtractor.0.96.5.msi'
    checksum64 = 'FFCAB0D766180AFC2832277397CDEC885D15270DECE33A9A51947B790F1F095B'
    checksumType64 = 'sha256'
    silentArgs = '/quiet /norestart'
    # 0 = success, 3010 = success but reboot required, 1641 = success and reboot initiated
    validExitCodes = @(0, 3010, 1641)
}
Install-ChocolateyPackage @packageArgs

# Add to PATH if not already there
$installPath = Join-Path $env:ProgramFiles 'CCExtractor'
if (Test-Path $installPath) {
    Install-ChocolateyPath -PathToInstall $installPath -PathType 'Machine'
    Write-Host "CCExtractor installed to: $installPath"
} else {
    # Do not fail the install, but make the skipped PATH update visible.
    Write-Warning "CCExtractor install directory not found at '$installPath'; PATH was not updated."
}

View File

@@ -0,0 +1,23 @@
# Chocolatey uninstall script for CCExtractor.
#
# Looks up the MSI product in the uninstall registry and removes it silently.
# The uninstall only runs when exactly one registry entry matches, so an
# ambiguous wildcard match can never remove the wrong product or produce a
# malformed msiexec command line.
$ErrorActionPreference = 'Stop'
$packageName = 'ccextractor'
$softwareName = 'CCExtractor*'

$packageArgs = @{
    packageName = $packageName
    fileType = 'MSI'
    silentArgs = '/quiet /norestart'
    # For MSI packages the product code is passed through silentArgs; file stays empty.
    file = ''
    # 1605 = product not installed, 1614 = product already uninstalled
    validExitCodes = @(0, 3010, 1605, 1614, 1641)
}

# Get the uninstall registry key(s); force an array so .Count is reliable.
[array]$regKeys = Get-UninstallRegistryKey -SoftwareName $softwareName

if ($regKeys.Count -eq 1) {
    # PSChildName of the uninstall key is the MSI product code.
    $packageArgs['silentArgs'] = "$($regKeys[0].PSChildName) $($packageArgs['silentArgs'])"
    Uninstall-ChocolateyPackage @packageArgs
} elseif ($regKeys.Count -eq 0) {
    Write-Warning "CCExtractor was not found in the registry. It may have been uninstalled already."
} else {
    Write-Warning "$($regKeys.Count) registry entries match '$softwareName'; skipping uninstall to avoid removing the wrong product."
    Write-Warning "Please uninstall the intended entry manually. Matches found:"
    $regKeys | ForEach-Object { Write-Warning "- $($_.DisplayName)" }
}

View File

@@ -0,0 +1,21 @@
# yaml-language-server: $schema=https://aka.ms/winget-manifest.installer.1.9.0.schema.json
PackageIdentifier: CCExtractor.CCExtractor
PackageVersion: 0.96.5
Platform:
- Windows.Desktop
MinimumOSVersion: 10.0.0.0
InstallModes:
- interactive
- silent
- silentWithProgress
InstallerSwitches:
Silent: /quiet
SilentWithProgress: /passive
UpgradeBehavior: install
Installers:
- Architecture: x64
InstallerType: msi
InstallerUrl: https://github.com/CCExtractor/ccextractor/releases/download/v0.96.5/CCExtractor.0.96.5.msi
InstallerSha256: FFCAB0D766180AFC2832277397CDEC885D15270DECE33A9A51947B790F1F095B
ManifestType: installer
ManifestVersion: 1.9.0

View File

@@ -0,0 +1,39 @@
# yaml-language-server: $schema=https://aka.ms/winget-manifest.defaultLocale.1.9.0.schema.json
PackageIdentifier: CCExtractor.CCExtractor
PackageVersion: 0.96.5
PackageLocale: en-US
Publisher: CCExtractor Development
PublisherUrl: https://ccextractor.org
PublisherSupportUrl: https://github.com/CCExtractor/ccextractor/issues
Author: CCExtractor Development Team
PackageName: CCExtractor
PackageUrl: https://ccextractor.org
License: GPL-2.0
LicenseUrl: https://github.com/CCExtractor/ccextractor/blob/master/LICENSE.txt
Copyright: Copyright (c) CCExtractor Development
ShortDescription: A tool to extract subtitles from video files
Description: |-
CCExtractor is a tool that analyzes video files and produces independent subtitle files from the closed captions data.
Key features:
- Extracts closed captions from various video formats (MPEG, H.264, MKV, MP4, etc.)
- Supports multiple input sources including DVDs, DVRs, and live TV captures
- Outputs to multiple formats (SRT, WebVTT, SAMI, transcript, etc.)
- OCR support for bitmap-based subtitles (DVB, teletext)
- Cross-platform (Windows, Linux, macOS)
- Includes a GUI for easy operation
Moniker: ccextractor
Tags:
- subtitles
- closed-captions
- video
- extraction
- accessibility
- srt
- dvb
- teletext
- ocr
- media
ReleaseNotesUrl: https://github.com/CCExtractor/ccextractor/releases
ManifestType: defaultLocale
ManifestVersion: 1.9.0

View File

@@ -0,0 +1,6 @@
# yaml-language-server: $schema=https://aka.ms/winget-manifest.version.1.9.0.schema.json
PackageIdentifier: CCExtractor.CCExtractor
PackageVersion: 0.96.5
DefaultLocale: en-US
ManifestType: version
ManifestVersion: 1.9.0

19
snap/local/run-ccextractor.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/bin/sh
# Snap launcher for ccextractor.
#
# Prepends the snap's bundled library directories to LD_LIBRARY_PATH and then
# replaces itself with the real binary. The first positional argument is
# dropped before the remaining arguments are forwarded to ccextractor
# (presumably it is the target command supplied by the snap command-chain
# launcher - confirm against snapcraft's command-chain behaviour).
set -e

# Default fallback
LIB_TRIPLET="x86_64-linux-gnu"

# Detect multiarch directory if present. The trailing '*' also matches
# triplets such as arm-linux-gnueabihf.
for d in "$SNAP/usr/lib/"*-linux-gnu*; do
    if [ -d "$d" ]; then
        LIB_TRIPLET=$(basename "$d")
        break
    fi
done

# Build the snap-private search path with NO trailing separator: an empty
# entry in LD_LIBRARY_PATH makes the dynamic loader search the current
# working directory, which must never happen implicitly.
SNAP_LIB_PATH="$SNAP/usr/lib:\
$SNAP/usr/lib/$LIB_TRIPLET:\
$SNAP/usr/lib/$LIB_TRIPLET/blas:\
$SNAP/usr/lib/$LIB_TRIPLET/lapack:\
$SNAP/usr/lib/$LIB_TRIPLET/pulseaudio"

# Append the caller's LD_LIBRARY_PATH only when it is set and non-empty.
export LD_LIBRARY_PATH="$SNAP_LIB_PATH${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Drop the first argument; guard the shift so an argument-less invocation
# does not abort under `set -e`.
if [ "$#" -gt 0 ]; then
    shift
fi

exec "$SNAP/usr/local/bin/ccextractor" "$@"

104
snap/snapcraft.yaml Normal file
View File

@@ -0,0 +1,104 @@
name: ccextractor
base: core22
version: '0.96.5'
summary: Closed Caption Extractor
description: |
CCExtractor is a tool for extracting closed captions from video files.
website: https://www.ccextractor.org
source-code: https://github.com/CCExtractor/ccextractor
confinement: classic
# Applications exposed by the snap.
apps:
ccextractor:
# Binary launched for this app, relative to the snap root.
command: usr/local/bin/ccextractor
# Wrapper listed ahead of the command; the ccextractor part installs it to local/run-ccextractor.sh.
command-chain:
- local/run-ccextractor.sh
# NOTE(review): confinement is declared as classic in this file - confirm whether the `home` plug has any effect.
plugs:
- home
parts:
# GPAC is built from source at a pinned tag.
# Only the shared library, its pkg-config file and its headers are staged.
gpac:
plugin: make
source: https://github.com/gpac/gpac.git
source-tag: abi-16.4
build-packages:
- build-essential
- pkg-config
- zlib1g-dev
- libssl-dev
- libfreetype6-dev
- libjpeg-dev
- libpng-dev
# Custom build: configure with a /usr prefix, install only the library target into the part's
# install directory, then rewrite the `prefix=` line of gpac.pc to point at the stage directory.
override-build: |
set -eux
./configure --prefix=/usr
make -j$(nproc)
make DESTDIR=$SNAPCRAFT_PART_INSTALL install-lib
sed -i "s|^prefix=.*|prefix=$SNAPCRAFT_STAGE/usr|" $SNAPCRAFT_PART_INSTALL/usr/lib/pkgconfig/gpac.pc
# Restrict what this part contributes to the stage area.
stage:
- usr/lib/libgpac*
- usr/lib/pkgconfig/gpac.pc
- usr/include/gpac
# Main CCExtractor build: a CMake project rooted at src/ in this repository.
ccextractor:
# Ordered after the gpac part, whose staged gpac.pc is put on PKG_CONFIG_PATH below.
after: [gpac]
plugin: cmake
source: .
source-subdir: src
# Put the staged pkg-config directory first on the pkg-config search path.
build-environment:
- PKG_CONFIG_PATH: "$SNAPCRAFT_STAGE/usr/lib/pkgconfig:$PKG_CONFIG_PATH"
# Build tools taken from snaps rather than from distribution packages.
build-snaps:
- cmake/latest/stable
- rustup/latest/stable
# Packages installed in the build environment only.
# NOTE(review): libblas3 and liblapack3 appear here but not under stage-packages - confirm they are
# meant to be build-time only.
build-packages:
- build-essential
- pkg-config
- clang
- llvm-dev
- libclang-dev
- libzvbi-dev
- libtesseract-dev
- libavcodec-dev
- libavformat-dev
- libavdevice-dev
- libavfilter-dev
- libswscale-dev
- libx11-dev
- libxcb1-dev
- libxcb-shm0-dev
- libpng-dev
- zlib1g-dev
- libblas3
- liblapack3
# Packages staged into the snap itself.
stage-packages:
- libzvbi0
- libfreetype6
- libpng16-16
- libprotobuf-c1
- libutf8proc2
- libgl1
- libglu1-mesa
- libavcodec58
- libavformat58
- libavutil56
- libavdevice58
- libavfilter7
- libswscale5
- libjpeg-turbo8
- libvorbis0a
- libtheora0
- libxvidcore4
- libfaad2
- libmad0
- liba52-0.7.4
- libpulse0
- pulseaudio-utils
# Custom build: install and select the stable Rust toolchain, run the plugin's normal build step,
# then copy the launcher script into the part's install tree at local/run-ccextractor.sh with mode 0755.
override-build: |
set -eux
rustup toolchain install stable
rustup default stable
export PATH="$HOME/.cargo/bin:$PATH"
snapcraftctl build
install -D -m 0755 \
$SNAPCRAFT_PROJECT_DIR/snap/local/run-ccextractor.sh \
$SNAPCRAFT_PART_INSTALL/local/run-ccextractor.sh

View File

@@ -9,7 +9,7 @@ option (WITH_HARDSUBX "Build with support for burned-in subtitles" OFF)
# Version number
set (CCEXTRACTOR_VERSION_MAJOR 0)
set (CCEXTRACTOR_VERSION_MINOR 89)
set (CCEXTRACTOR_VERSION_MINOR 96)
# Get project directory
get_filename_component(BASE_PROJ_DIR ../ ABSOLUTE)
@@ -255,4 +255,13 @@ endif (PKG_CONFIG_FOUND)
target_link_libraries (ccextractor ${EXTRA_LIBS})
target_include_directories (ccextractor PUBLIC ${EXTRA_INCLUDES})
# ccx_rust (Rust) calls C functions from ccx (like decode_vbi).
# Force the linker to pull these symbols from ccx before processing ccx_rust.
if (NOT WIN32 AND NOT APPLE)
target_link_options (ccextractor PRIVATE
-Wl,--undefined=decode_vbi
-Wl,--undefined=do_cb
-Wl,--undefined=store_hdcc)
endif()
install (TARGETS ccextractor DESTINATION bin)

View File

@@ -202,6 +202,12 @@ int start_ccx()
if (!ret)
ret = tmp;
break;
case CCX_SM_SCC:
mprint("\rAnalyzing data in SCC (Scenarist Closed Caption) mode\n");
tmp = raw_loop(ctx);
if (!ret)
ret = tmp;
break;
case CCX_SM_RCWT:
mprint("\rAnalyzing data in CCExtractor's binary format\n");
tmp = rcwt_loop(ctx);
@@ -429,6 +435,9 @@ int main(int argc, char *argv[])
int compile_ret = ccxr_parse_parameters(argc, argv);
// Update the Rust logger target after parsing so --quiet is respected
ccxr_update_logger_target();
if (compile_ret == EXIT_NO_INPUT_FILES)
{
print_usage();

View File

@@ -1,9 +1,9 @@
cmake_policy (SET CMP0037 NEW)
if(MSVC)
set (CMAKE_C_FLAGS "-W3 /wd4005 /wd4996")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 /wd4005 /wd4996")
else (MSVC)
set (CMAKE_C_FLAGS "-Wall -Wno-pointer-sign -g -std=gnu99")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-pointer-sign -g -std=gnu99")
endif(MSVC)
if(WIN32)

View File

@@ -379,11 +379,10 @@ void sei_rbsp(struct avc_ctx *ctx, unsigned char *seibuf, unsigned char *seiend)
}
else
{
// TODO: This really really looks bad
mprint("WARNING: Unexpected SEI unit length...trying to continue.");
temp_debug = 1;
mprint("\n Failed block (at sei_rbsp) was:\n");
dump(CCX_DMT_GENERIC_NOTICES, (unsigned char *)seibuf, seiend - seibuf, 0, 0);
// Unexpected SEI length - common with malformed streams, don't spam output
dbg_print(CCX_DMT_VERBOSE, "WARNING: Unexpected SEI unit length (parsed to %p, expected %p)...trying to continue.\n",
(void *)tbuf, (void *)(seiend - 1));
dump(CCX_DMT_VERBOSE, (unsigned char *)seibuf, seiend - seibuf, 0, 0);
ctx->num_unexpected_sei_length++;
}
@@ -393,20 +392,24 @@ void sei_rbsp(struct avc_ctx *ctx, unsigned char *seibuf, unsigned char *seiend)
unsigned char *sei_message(struct avc_ctx *ctx, unsigned char *seibuf, unsigned char *seiend)
{
int payload_type = 0;
while (*seibuf == 0xff)
while (seibuf < seiend && *seibuf == 0xff)
{
payload_type += 255;
seibuf++;
}
if (seibuf >= seiend)
return NULL;
payload_type += *seibuf;
seibuf++;
int payload_size = 0;
while (*seibuf == 0xff)
while (seibuf < seiend && *seibuf == 0xff)
{
payload_size += 255;
seibuf++;
}
if (seibuf >= seiend)
return NULL;
payload_size += *seibuf;
seibuf++;
@@ -904,10 +907,10 @@ void seq_parameter_set_rbsp(struct avc_ctx *ctx, unsigned char *seqbuf, unsigned
dvprint("vcl_hrd_parameters_present_flag= %llX\n", tmp1);
if (tmp)
{
// TODO.
mprint("vcl_hrd. Not implemented for now. Hopefully not needed. Skipping rest of NAL\n");
// VCL HRD parameters are for video buffering compliance, not needed for caption extraction.
// Just skip and continue - this doesn't affect our ability to extract captions.
mprint("Skipping VCL HRD parameters (not needed for caption extraction)\n");
ctx->num_vcl_hrd++;
// exit(1);
}
if (tmp || tmp1)
{
@@ -993,9 +996,9 @@ void slice_header(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, un
if (nal_unit_type == 5)
{
// idr_pic_id: Read to advance bitstream position; value not needed for caption extraction
tmp = read_exp_golomb_unsigned(&q1);
dvprint("idr_pic_id= % 4lld (%#llX)\n", tmp, tmp);
// TODO
}
if (dec_ctx->avc_ctx->pic_order_cnt_type == 0)
{

View File

@@ -212,6 +212,7 @@ enum ccx_stream_mode_enum
CCX_SM_GXF = 11,
CCX_SM_MKV = 12,
CCX_SM_MXF = 13,
CCX_SM_SCC = 14, // Scenarist Closed Caption input
CCX_SM_AUTODETECT = 16
};

View File

@@ -74,6 +74,8 @@ void init_options(struct ccx_s_options *options)
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
options->psm = 3; // Default PSM mode (3 is the default tesseract as well)
options->ocr_quantmode = 0; // No quantization (better OCR accuracy for DVB subtitles)
options->ocr_line_split = 0; // By default, don't split images into lines (pending testing)
options->ocr_blacklist = 1; // By default, use character blacklist to prevent common OCR errors (| vs I, etc.)
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
options->analyze_video_stream = 0;
@@ -139,7 +141,9 @@ void init_options(struct ccx_s_options *options)
options->enc_cfg.services_charsets = NULL;
options->enc_cfg.all_services_charset = NULL;
options->enc_cfg.with_semaphore = 0;
options->enc_cfg.force_dropframe = 0; // Assume No Drop Frame for MCC Encode.
options->enc_cfg.force_dropframe = 0; // Assume No Drop Frame for MCC Encode.
options->enc_cfg.scc_framerate = 0; // Default: 29.97fps for SCC output
options->enc_cfg.scc_accurate_timing = 0; // Default: off for backwards compatibility (issue #1120)
options->enc_cfg.extract_only_708 = 0;
options->settings_dtvcc.enabled = 0;
@@ -152,6 +156,8 @@ void init_options(struct ccx_s_options *options)
options->settings_dtvcc.services_enabled, 0,
CCX_DTVCC_MAX_SERVICES * sizeof(options->settings_dtvcc.services_enabled[0]));
options->scc_framerate = 0; // Default: 29.97fps
#ifdef WITH_LIBCURL
options->curlposturl = NULL;
#endif

View File

@@ -75,6 +75,10 @@ struct encoder_cfg
// MCC File
int force_dropframe; // 1 if dropframe frame count should be used. defaults to no drop frame.
// SCC output framerate
int scc_framerate; // SCC output framerate: 0=29.97 (default), 1=24, 2=25, 3=30
int scc_accurate_timing; // If 1, use bandwidth-aware timing for broadcast compliance (issue #1120)
// text -> png (text render)
char *render_font; // The font used to render text if needed (e.g. teletext->spupng)
char *render_font_italics;
@@ -149,6 +153,8 @@ struct ccx_s_options // Options from user parameters
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
int ocr_quantmode; // How to quantize the bitmap before passing to to tesseract (0=no quantization at all, 1=CCExtractor's internal)
int ocr_line_split; // If 1, split images into lines before OCR (uses PSM 7 for better accuracy)
int ocr_blacklist; // If 1, use character blacklist to prevent common OCR errors (default: enabled)
char *mkvlang; // The name of the language stream for MKV
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.
@@ -195,6 +201,7 @@ struct ccx_s_options // Options from user parameters
int multiprogram;
int out_interval;
int segment_on_key_frames_only;
int scc_framerate; // SCC input framerate: 0=29.97 (default), 1=24, 2=25, 3=30
#ifdef WITH_LIBCURL
char *curlposturl;
#endif

View File

@@ -201,6 +201,9 @@ void delete_to_end_of_row(ccx_decoder_608_context *context)
{
if (context->mode != MODE_TEXT)
{
if (context->cursor_row >= CCX_DECODER_608_SCREEN_ROWS)
return;
struct eia608_screen *use_buffer = get_writing_buffer(context);
for (int i = context->cursor_column; i <= CCX_DECODER_608_SCREEN_WIDTH - 1; i++)
{
@@ -221,6 +224,10 @@ void write_char(const unsigned char c, ccx_decoder_608_context *context)
/* printf ("\rWriting char [%c] at %s:%d:%d\n",c,
use_buffer == &wb->data608->buffer1?"B1":"B2",
wb->data608->cursor_row,wb->data608->cursor_column); */
if (context->cursor_row >= CCX_DECODER_608_SCREEN_ROWS || context->cursor_column >= CCX_DECODER_608_SCREEN_WIDTH)
return;
use_buffer->characters[context->cursor_row][context->cursor_column] = c;
use_buffer->colors[context->cursor_row][context->cursor_column] = context->current_color;
use_buffer->fonts[context->cursor_row][context->cursor_column] = context->font;
@@ -316,10 +323,20 @@ int write_cc_buffer(ccx_decoder_608_context *context, struct cc_subtitle *sub)
if (!data->empty && context->output_format != CCX_OF_NULL)
{
struct eia608_screen *new_data = (struct eia608_screen *)realloc(sub->data, (sub->nb_data + 1) * sizeof(*data));
size_t new_size;
if (sub->nb_data + 1 > SIZE_MAX / sizeof(struct eia608_screen))
{
ccx_common_logging.log_ftn("Too many screens, cannot allocate more memory.\n");
return 0;
}
new_size = (sub->nb_data + 1) * sizeof(struct eia608_screen);
struct eia608_screen *new_data = (struct eia608_screen *)realloc(sub->data, new_size);
if (!new_data)
{
ccx_common_logging.log_ftn("No Memory left");
ccx_common_logging.log_ftn("Out of memory while reallocating screen buffer\n");
return 0;
}
sub->data = new_data;
@@ -386,10 +403,20 @@ int write_cc_line(ccx_decoder_608_context *context, struct cc_subtitle *sub)
if (!data->empty)
{
struct eia608_screen *new_data = (struct eia608_screen *)realloc(sub->data, (sub->nb_data + 1) * sizeof(*data));
size_t new_size;
if (sub->nb_data + 1 > SIZE_MAX / sizeof(struct eia608_screen))
{
ccx_common_logging.log_ftn("Too many screens, cannot allocate more memory.\n");
return 0;
}
new_size = (sub->nb_data + 1) * sizeof(struct eia608_screen);
struct eia608_screen *new_data = (struct eia608_screen *)realloc(sub->data, new_size);
if (!new_data)
{
ccx_common_logging.log_ftn("No Memory left");
ccx_common_logging.log_ftn("Out of memory while reallocating screen buffer\n");
return 0;
}
sub->data = new_data;

View File

@@ -998,6 +998,14 @@ void dtvcc_handle_DFx_DefineWindow(dtvcc_service_decoder *decoder, int window_id
int row_count = (data[4] & 0xf) + 1; // according to CEA-708-D
int anchor_point = data[4] >> 4;
int col_count = (data[5] & 0x3f) + 1; // according to CEA-708-D
if (row_count > CCX_DTVCC_MAX_ROWS || col_count > CCX_DTVCC_MAX_COLUMNS)
{
ccx_common_logging.log_ftn("[CEA-708] Invalid window size %dx%d (max %dx%d), rejecting window definition\n",
row_count, col_count, CCX_DTVCC_MAX_ROWS, CCX_DTVCC_MAX_COLUMNS);
return;
}
int pen_style = data[6] & 0x7;
int win_style = (data[6] >> 3) & 0x7;
@@ -1341,6 +1349,14 @@ void dtvcc_handle_SPL_SetPenLocation(dtvcc_service_decoder *decoder, unsigned ch
}
dtvcc_window *window = &decoder->windows[decoder->current_window];
if (row >= window->row_count || col >= window->col_count)
{
ccx_common_logging.log_ftn("[CEA-708] dtvcc_handle_SPL_SetPenLocation: "
"Invalid pen location %d:%d for window size %dx%d, rejecting command\n",
row, col, window->row_count, window->col_count);
return;
}
window->pen_row = row;
window->pen_column = col;
}
@@ -1479,7 +1495,12 @@ int dtvcc_handle_C0(dtvcc_ctx *dtvcc,
else if (c0 >= 0x18 && c0 <= 0x1F)
{
if (c0 == DTVCC_C0_P16) // PE16
dtvcc_handle_C0_P16(decoder, data + 1);
{
if (data_length >= 3)
dtvcc_handle_C0_P16(decoder, data + 1);
else
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_handle_C0: Not enough data for P16\n");
}
len = 3;
}
if (len == -1)
@@ -1633,6 +1654,9 @@ int dtvcc_handle_extended_char(dtvcc_service_decoder *decoder, unsigned char *da
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] In dtvcc_handle_extended_char, "
"first data code: [%c], length: [%u]\n",
data[0], data_length);
if (data_length < 1)
return 0;
unsigned char c = 0x20; // Default to space
unsigned char code = data[0];
if (/* data[i]>=0x00 && */ code <= 0x1F) // Comment to silence warning
@@ -1701,8 +1725,17 @@ void dtvcc_process_service_block(dtvcc_ctx *dtvcc,
}
else // Use extended set
{
used = dtvcc_handle_extended_char(decoder, data + i + 1, data_length - 1);
used++; // Since we had DTVCC_C0_EXT1
if (i + 1 >= data_length)
{
used = 1; // skip EXT1
}
else
{
used = dtvcc_handle_extended_char(decoder,
data + i + 1,
data_length - i - 1) +
1;
}
}
i += used;
}
@@ -1754,6 +1787,12 @@ void dtvcc_process_current_packet(dtvcc_ctx *dtvcc, int len)
if (service_number == 7) // There is an extended header
{
if (pos + 1 >= dtvcc->current_packet + len)
{
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_process_current_packet: "
"Truncated extended header, stopping.\n");
break;
}
pos++;
service_number = (pos[0] & 0x3F); // 6 more significant bits
// printf ("Extended header: Service number: [%d]\n",service_number);

View File

@@ -224,7 +224,12 @@ int do_cb(struct lib_cc_decode *ctx, unsigned char *cc_block, struct cc_subtitle
void dinit_cc_decode(struct lib_cc_decode **ctx)
{
struct lib_cc_decode *lctx = *ctx;
#ifndef DISABLE_RUST
ccxr_dtvcc_free(lctx->dtvcc_rust);
lctx->dtvcc_rust = NULL;
#else
dtvcc_free(&lctx->dtvcc);
#endif
dinit_avc(&lctx->avc_ctx);
ccx_decoder_608_dinit_library(&lctx->context_cc608_field_1);
ccx_decoder_608_dinit_library(&lctx->context_cc608_field_2);
@@ -294,10 +299,16 @@ struct lib_cc_decode *init_cc_decode(struct ccx_decoders_common_settings_t *sett
ctx->no_rollup = setting->no_rollup;
ctx->noscte20 = setting->noscte20;
#ifndef DISABLE_RUST
ctx->dtvcc_rust = ccxr_dtvcc_init(setting->settings_dtvcc);
ctx->dtvcc = NULL; // Not used when Rust is enabled
#else
ctx->dtvcc = dtvcc_init(setting->settings_dtvcc);
if (!ctx->dtvcc)
fatal(EXIT_NOT_ENOUGH_MEMORY, "In init_cc_decode: Out of memory initializing dtvcc.");
ctx->dtvcc->is_active = setting->settings_dtvcc->enabled;
ctx->dtvcc_rust = NULL;
#endif
if (setting->codec == CCX_CODEC_ATSC_CC)
{
@@ -477,6 +488,13 @@ void flush_cc_decode(struct lib_cc_decode *ctx, struct cc_subtitle *sub)
}
}
}
#ifndef DISABLE_RUST
if (ccxr_dtvcc_is_active(ctx->dtvcc_rust))
{
ctx->current_field = 3;
ccxr_flush_active_decoders(ctx->dtvcc_rust);
}
#else
if (ctx->dtvcc->is_active)
{
for (int i = 0; i < CCX_DTVCC_MAX_SERVICES; i++)
@@ -491,6 +509,7 @@ void flush_cc_decode(struct lib_cc_decode *ctx, struct cc_subtitle *sub)
}
}
}
#endif
}
struct encoder_ctx *copy_encoder_context(struct encoder_ctx *ctx)
{

View File

@@ -32,4 +32,10 @@ struct cc_subtitle *copy_subtitle(struct cc_subtitle *sub);
void free_encoder_context(struct encoder_ctx *ctx);
void free_decoder_context(struct lib_cc_decode *ctx);
void free_subtitle(struct cc_subtitle *sub);
#ifndef DISABLE_RUST
// Rust FFI function to flush active CEA-708 service decoders
extern void ccxr_flush_active_decoders(void *dtvcc_rust);
#endif
#endif

View File

@@ -724,16 +724,17 @@ static int parse_csi(ISDBSubContext *ctx, const uint8_t *buf, int len)
// Copy buf in arg
for (i = 0; *buf != 0x20; i++)
{
if (i >= (sizeof(arg)) + 1)
if (i >= sizeof(arg) - 1)
{
isdb_log("UnExpected CSI %d >= %d", sizeof(arg) + 1, i);
isdb_log("UnExpected CSI: too long");
break;
}
arg[i] = *buf;
buf++;
}
/* ignore terminating 0x20 character */
arg[i] = *buf++;
if (i < sizeof(arg))
arg[i] = *buf++;
switch (*buf)
{

View File

@@ -208,6 +208,7 @@ struct lib_cc_decode
int false_pict_header;
dtvcc_ctx *dtvcc;
void *dtvcc_rust; // Persistent Rust CEA-708 decoder context
int current_field;
// Analyse/use the picture information
int maxtref; // Use to remember the temporal reference number

View File

@@ -285,6 +285,9 @@ static void ccx_demuxer_print_cfg(struct ccx_demuxer *ctx)
case CCX_SM_MXF:
mprint("MXF");
break;
case CCX_SM_SCC:
mprint("SCC");
break;
#ifdef WTV_DEBUG
case CCX_SM_HEX_DUMP:
mprint("Hex");
@@ -348,7 +351,6 @@ struct ccx_demuxer *init_demuxer(void *parent, struct demuxer_cfg *cfg)
{
ctx->pinfo[i].got_important_streams_min_pts[j] = UINT64_MAX;
}
ctx->pinfo[i].initialized_ocr = 0;
ctx->pinfo[i].version = 0xFF; // Not real in a real stream since it's 5 bits. FF => Not initialized
}

View File

@@ -35,7 +35,6 @@ struct program_info
{
int pid;
int program_number;
int initialized_ocr; // Avoid initializing the OCR more than once
uint8_t analysed_PMT_once : 1;
uint8_t version;
uint8_t saved_section[1021];

View File

@@ -75,12 +75,15 @@ enum MXFLocalTag
void update_tid_lut(struct MXFContext *ctx, uint32_t track_id, uint8_t *track_number, struct ccx_rational edit_rate)
{
int i;
debug("update_tid_lut: track_id=%u (0x%x), track_number=%02X%02X%02X%02X, cap_track_id=%u\n",
track_id, track_id, track_number[0], track_number[1], track_number[2], track_number[3], ctx->cap_track_id);
// Update essence element key if we have track Id of caption
if (ctx->cap_track_id == track_id)
{
memcpy(ctx->cap_essence_key, mxf_essence_element_key, 12);
memcpy(ctx->cap_essence_key + 12, track_number, 4);
ctx->edit_rate = edit_rate;
debug("MXF: Found caption track, track_id=%u\n", track_id);
}
for (i = 0; i < ctx->nb_tracks; i++)
@@ -248,6 +251,7 @@ static int mxf_read_vanc_vbi_desc(struct ccx_demuxer *demux, uint64_t size)
{
case MXF_TAG_LTRACK_ID:
ctx->cap_track_id = buffered_get_be32(demux);
debug("MXF: VANC/VBI descriptor found, Linked Track ID = %u\n", ctx->cap_track_id);
update_cap_essence_key(ctx, ctx->cap_track_id);
break;
default:
@@ -304,6 +308,17 @@ static int mxf_read_cdp_data(struct ccx_demuxer *demux, int size, struct demuxer
log("Incomplete CDP packet\n");
ret = buffered_read(demux, data->buffer + data->len, cc_count * 3);
// Log first few bytes of cc_data for debugging
if (cc_count > 0)
{
unsigned char *cc_ptr = data->buffer + data->len;
debug("cc_data (first 6 triplets): ");
for (int j = 0; j < (cc_count < 6 ? cc_count : 6); j++)
{
debug("%02X%02X%02X ", cc_ptr[j * 3], cc_ptr[j * 3 + 1], cc_ptr[j * 3 + 2]);
}
debug("\n");
}
data->len += cc_count * 3;
demux->past += cc_count * 3;
len += ret;
@@ -361,7 +376,10 @@ static int mxf_read_vanc_data(struct ccx_demuxer *demux, uint64_t size, struct d
// uint8_t count; /* Currently unused */
if (size < 19)
{
debug("VANC data too small: %" PRIu64 " < 19\n", size);
goto error;
}
ret = buffered_read(demux, vanc_header, 16);
@@ -370,31 +388,39 @@ static int mxf_read_vanc_data(struct ccx_demuxer *demux, uint64_t size, struct d
return CCX_EOF;
len += ret;
debug("VANC header: num_packets=%d, line=0x%02x%02x, wrap_type=0x%02x, sample_config=0x%02x\n",
vanc_header[1], vanc_header[2], vanc_header[3], vanc_header[4], vanc_header[5]);
for (int i = 0; i < vanc_header[1]; i++)
{
DID = buffered_get_byte(demux);
len++;
debug("VANC packet %d: DID=0x%02x\n", i, DID);
if (!(DID == 0x61 || DID == 0x80))
{
debug("DID 0x%02x not recognized as caption DID\n", DID);
goto error;
}
SDID = buffered_get_byte(demux);
len++;
debug("VANC packet %d: SDID=0x%02x\n", i, SDID);
if (SDID == 0x01)
debug("Caption Type 708\n");
else if (SDID == 0x02)
debug("Caption Type 608\n");
cdp_size = buffered_get_byte(demux);
debug("VANC packet %d: cdp_size=%d\n", i, cdp_size);
if (cdp_size + 19 > size)
{
debug("Incomplete cdp(%d) in anc data(%d)\n", cdp_size, size);
log("Incomplete cdp(%d) in anc data(%" PRIu64 ")\n", cdp_size, size);
goto error;
}
len++;
ret = mxf_read_cdp_data(demux, cdp_size, data);
debug("mxf_read_cdp_data returned %d, data->len=%d\n", ret, data->len);
len += ret;
// len += (3 + count + 4);
}
@@ -411,15 +437,33 @@ static int mxf_read_essence_element(struct ccx_demuxer *demux, uint64_t size, st
int ret;
struct MXFContext *ctx = demux->private_data;
debug("mxf_read_essence_element: ctx->type=%d (ANC=%d, VBI=%d), size=%" PRIu64 "\n",
ctx->type, MXF_CT_ANC, MXF_CT_VBI, size);
if (ctx->type == MXF_CT_ANC)
{
data->bufferdatatype = CCX_RAW_TYPE;
ret = mxf_read_vanc_data(demux, size, data);
data->pts = ctx->cap_count;
debug("mxf_read_vanc_data returned %d, data->len=%d\n", ret, data->len);
// Calculate PTS in 90kHz units from frame count and edit rate
// edit_rate is frames per second (e.g., 25/1 for 25fps)
// PTS = frame_count * 90000 / fps = frame_count * 90000 * edit_rate.den / edit_rate.num
if (ctx->edit_rate.num > 0 && ctx->edit_rate.den > 0)
{
data->pts = (int64_t)ctx->cap_count * 90000 * ctx->edit_rate.den / ctx->edit_rate.num;
}
else
{
// Fallback to 25fps if edit_rate not set
data->pts = (int64_t)ctx->cap_count * 90000 / 25;
}
debug("Frame %d, PTS=%" PRId64 " (edit_rate=%d/%d)\n",
ctx->cap_count, data->pts, ctx->edit_rate.num, ctx->edit_rate.den);
ctx->cap_count++;
}
else
{
debug("Skipping essence element (not ANC type)\n");
ret = buffered_skip(demux, size);
demux->past += ret;
}
@@ -514,6 +558,7 @@ static int read_packet(struct ccx_demuxer *demux, struct demuxer_data *data)
KLVPacket klv;
const MXFReadTableEntry *reader;
struct MXFContext *ctx = demux->private_data;
static int first_essence_logged = 0;
while ((ret = klv_read_packet(&klv, demux)) == 0)
{
debug("Key %02X%02X%02X%02X%02X%02X%02X%02X.%02X%02X%02X%02X%02X%02X%02X%02X size %" PRIu64 "\n",
@@ -523,8 +568,25 @@ static int read_packet(struct ccx_demuxer *demux, struct demuxer_data *data)
klv.key[12], klv.key[13], klv.key[14], klv.key[15],
klv.length);
// Check if this is an essence element key (first 12 bytes match)
if (IS_KLV_KEY(klv.key, mxf_essence_element_key) && !first_essence_logged)
{
debug("MXF: First essence element key: %02X%02X%02X%02X%02X%02X%02X%02X.%02X%02X%02X%02X%02X%02X%02X%02X\n",
klv.key[0], klv.key[1], klv.key[2], klv.key[3],
klv.key[4], klv.key[5], klv.key[6], klv.key[7],
klv.key[8], klv.key[9], klv.key[10], klv.key[11],
klv.key[12], klv.key[13], klv.key[14], klv.key[15]);
debug("MXF: cap_essence_key: %02X%02X%02X%02X%02X%02X%02X%02X.%02X%02X%02X%02X%02X%02X%02X%02X\n",
ctx->cap_essence_key[0], ctx->cap_essence_key[1], ctx->cap_essence_key[2], ctx->cap_essence_key[3],
ctx->cap_essence_key[4], ctx->cap_essence_key[5], ctx->cap_essence_key[6], ctx->cap_essence_key[7],
ctx->cap_essence_key[8], ctx->cap_essence_key[9], ctx->cap_essence_key[10], ctx->cap_essence_key[11],
ctx->cap_essence_key[12], ctx->cap_essence_key[13], ctx->cap_essence_key[14], ctx->cap_essence_key[15]);
first_essence_logged = 1;
}
if (IS_KLV_KEY(klv.key, ctx->cap_essence_key))
{
debug("MXF: Found ANC essence element, size=%" PRIu64 "\n", klv.length);
mxf_read_essence_element(demux, klv.length, data);
if (data->len > 0)
break;
@@ -566,8 +628,15 @@ int ccx_mxf_getmoredata(struct lib_ccx_ctx *ctx, struct demuxer_data **ppdata)
data->program_number = 1;
data->stream_pid = 1;
data->codec = CCX_CODEC_ATSC_CC;
data->tb.num = 1001;
data->tb.den = 30000;
// PTS is already calculated in 90kHz units by mxf_read_essence_element
data->tb.num = 1;
data->tb.den = 90000;
// Enable CEA-708 (DTVCC) decoder for MXF files with VANC captions
if (ctx->dec_global_setting && ctx->dec_global_setting->settings_dtvcc)
{
ctx->dec_global_setting->settings_dtvcc->enabled = 1;
}
}
else
{
@@ -576,6 +645,11 @@ int ccx_mxf_getmoredata(struct lib_ccx_ctx *ctx, struct demuxer_data **ppdata)
ret = read_packet(ctx->demux_ctx, data);
// Ensure timebase is 90kHz since PTS is calculated in 90kHz units
// CDP parsing may have set a frame-based timebase which would cause incorrect conversion
data->tb.num = 1;
data->tb.den = 90000;
return ret;
}

View File

@@ -25,7 +25,7 @@ void dtvcc_process_data(struct dtvcc_ctx *dtvcc,
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_process_data: DTVCC Channel Packet Data\n");
if (cc_valid && dtvcc->is_current_packet_header_parsed)
{
if (dtvcc->current_packet_length > 253)
if (dtvcc->current_packet_length + 2 > CCX_DTVCC_MAX_PACKET_LENGTH)
{
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_process_data: "
"Warning: Legal packet size exceeded (1), data not added.\n");
@@ -51,7 +51,7 @@ void dtvcc_process_data(struct dtvcc_ctx *dtvcc,
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_process_data: DTVCC Channel Packet Start\n");
if (cc_valid)
{
if (dtvcc->current_packet_length > CCX_DTVCC_MAX_PACKET_LENGTH - 1)
if (dtvcc->current_packet_length + 2 > CCX_DTVCC_MAX_PACKET_LENGTH)
{
ccx_common_logging.debug_ftn(CCX_DMT_708, "[CEA-708] dtvcc_process_data: "
"Warning: Legal packet size exceeded (2), data not added.\n");

View File

@@ -10,4 +10,14 @@ void dtvcc_process_data(struct dtvcc_ctx *dtvcc,
dtvcc_ctx *dtvcc_init(ccx_decoder_dtvcc_settings *opts);
void dtvcc_free(dtvcc_ctx **);
#ifndef DISABLE_RUST
// Rust FFI functions for persistent CEA-708 decoder
extern void *ccxr_dtvcc_init(struct ccx_decoder_dtvcc_settings *settings_dtvcc);
extern void ccxr_dtvcc_free(void *dtvcc_rust);
extern void ccxr_dtvcc_process_data(void *dtvcc_rust, const unsigned char cc_valid,
const unsigned char cc_type, const unsigned char data1, const unsigned char data2);
extern int ccxr_dtvcc_is_active(void *dtvcc_rust);
extern void ccxr_dtvcc_set_active(void *dtvcc_rust, int active);
#endif
#endif // CCEXTRACTOR_CCX_DTVCC_H

View File

@@ -775,6 +775,7 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
return NULL;
}
ctx->in_fileformat = opt->in_format;
ctx->is_pal = (opt->in_format == 2);
/** used in case of SUB_EOD_MARKER */
ctx->prev_start = -1;
@@ -840,6 +841,10 @@ struct encoder_ctx *init_encoder(struct encoder_cfg *opt)
ctx->segment_pending = 0;
ctx->segment_last_key_frame = 0;
ctx->nospupngocr = opt->nospupngocr;
ctx->scc_framerate = opt->scc_framerate;
ctx->scc_accurate_timing = opt->scc_accurate_timing;
ctx->scc_last_transmission_end = 0;
ctx->scc_last_display_end = 0;
// Initialize teletext multi-page output arrays (issue #665)
ctx->tlt_out_count = 0;
@@ -1045,6 +1050,28 @@ int encode_sub(struct encoder_ctx *context, struct cc_subtitle *sub)
freep(&sub->data);
break;
case CC_BITMAP:;
// Apply subs_delay to bitmap subtitles (DVB, DVD, etc.)
// This is the same as what's done for CC_608 above
sub->start_time += context->subs_delay;
sub->end_time += context->subs_delay;
// After adding delay, if start/end time is lower than 0, skip this subtitle
if (sub->start_time < 0 || sub->end_time <= 0)
{
// Free bitmap data to avoid memory leak
if (sub->datatype == CC_DATATYPE_DVB)
{
struct cc_bitmap *bitmap_tmp = (struct cc_bitmap *)sub->data;
if (bitmap_tmp)
{
freep(&bitmap_tmp->data0);
freep(&bitmap_tmp->data1);
}
}
freep(&sub->data);
sub->nb_data = 0;
break;
}
#ifdef ENABLE_OCR
struct cc_bitmap *rect;

View File

@@ -153,6 +153,14 @@ struct encoder_ctx
unsigned int cdp_hdr_seq;
int force_dropframe;
// SCC output framerate
int scc_framerate; // SCC output framerate: 0=29.97 (default), 1=24, 2=25, 3=30
// SCC accurate timing (issue #1120)
int scc_accurate_timing; // If 1, use bandwidth-aware timing for broadcast compliance
LLONG scc_last_transmission_end; // When last caption transmission ends (ms)
LLONG scc_last_display_end; // When last caption display ends (ms)
int new_sentence; // Capitalize next letter?
int program_number;
@@ -174,12 +182,12 @@ struct encoder_ctx
// OCR in SPUPNG
int nospupngocr;
int is_pal;
// Teletext multi-page output (issue #665)
struct ccx_s_write *tlt_out[MAX_TLT_PAGES_EXTRACT]; // Output files per teletext page
uint16_t tlt_out_pages[MAX_TLT_PAGES_EXTRACT]; // Page numbers for each output slot
unsigned int tlt_srt_counter[MAX_TLT_PAGES_EXTRACT]; // SRT counter per page
int tlt_out_count; // Number of teletext output files
int tlt_out_count; // Number of teletext output files
};
#define INITIAL_ENC_BUFFER_CAPACITY 2048

View File

@@ -10,6 +10,171 @@ unsigned char odd_parity(const unsigned char byte)
return byte | !(cc608_parity(byte) % 2) << 7;
}
/**
* SCC Accurate Timing Implementation (Issue #1120)
*
* EIA-608 bandwidth constraints:
* - 2 bytes per frame at 29.97 FPS (or configured frame rate)
* - Captions must be pre-loaded before display time
* - Each control code takes 2 bytes (sent twice for reliability = 4 bytes total)
* - Text characters take 1 byte each
*/
// Translate the scc_framerate option into frames per second.
// Selector values: 1 -> 24, 2 -> 25, 3 -> 30; every other value (including
// the default 0) yields 29.97.
static float get_scc_fps_internal(int scc_framerate)
{
	if (scc_framerate == 1)
		return 24.0f;
	if (scc_framerate == 2)
		return 25.0f;
	if (scc_framerate == 3)
		return 30.0f;
	// 0 and anything unrecognized
	return 29.97f;
}
/**
 * Estimate the total number of bytes needed to transmit a caption
 *
 * Byte costs used by the estimate:
 * - Control code (RCL, EOC, ENM): 2 bytes x 2 (sent twice) = 4 bytes
 * - Preamble code: 2 bytes x 2 = 4 bytes, one per used row
 * - Mid-row code (color/style change): 2 bytes x 2 = 4 bytes
 * - Text character: 1 byte each
 * - Padding: 1 byte if a row has an odd number of text bytes
 *
 * Every column between the first and last character of a row (as reported by
 * find_limit_characters) is counted as one text byte, so the columns of a row
 * are always contiguous and no additional positioning codes are charged
 * inside a row.
 *
 * @param data Screen buffer to measure
 * @return Estimated byte count for RCL + row contents + EOC + ENM
 */
static unsigned int calculate_caption_bytes(const struct eia608_screen *data)
{
	// RCL (Resume Caption Loading): 4 bytes
	unsigned int total_bytes = 4;

	for (unsigned char row = 0; row < 15; ++row)
	{
		if (!data->row_used[row])
			continue;

		int first, last;
		find_limit_characters(data->characters[row], &first, &last, CCX_DECODER_608_SCREEN_WIDTH);
		if (first > last)
			continue; // Row is flagged as used but holds no characters

		// One preamble per row: 4 bytes
		total_bytes += 4;

		// Each row starts from white/regular; every change of font or color
		// along the row costs one mid-row code.
		enum font_bits prev_font = FONT_REGULAR;
		enum ccx_decoder_608_color_code prev_color = COL_WHITE;
		for (int col = first; col <= last; ++col)
		{
			if (data->fonts[row][col] != prev_font || data->colors[row][col] != prev_color)
			{
				total_bytes += 4; // Mid-row code
				prev_font = data->fonts[row][col];
				prev_color = data->colors[row][col];
			}
		}

		// Text bytes: one per column in [first, last], rounded up to an even count
		unsigned int char_count = (unsigned int)(last - first + 1);
		total_bytes += char_count;
		if (char_count % 2 == 1)
			total_bytes++; // Padding
	}

	// EOC (End of Caption): 4 bytes
	total_bytes += 4;
	// ENM (Erase Non-displayed Memory): 4 bytes
	total_bytes += 4;

	return total_bytes;
}
/**
 * Work out when loading of a caption has to begin so that it is fully
 * transmitted before it must be displayed.
 *
 * Two bytes are sent per frame, so the load takes ceil(total_bytes / 2)
 * frames; one further frame is reserved for the EOC that precedes display.
 *
 * @param display_time When the caption should appear on screen (ms)
 * @param total_bytes Total bytes to transmit
 * @param fps Frame rate
 * @return Time to begin loading the caption (ms), never negative
 */
static LLONG calculate_preroll_time(LLONG display_time, unsigned int total_bytes, float fps)
{
	const float frame_ms = 1000.0f / fps;
	const unsigned int frame_count = (total_bytes + 1) / 2;

	// Both durations are truncated to whole milliseconds independently
	const LLONG load_ms = (LLONG)(frame_count * frame_ms);
	const LLONG eoc_frame_ms = (LLONG)frame_ms;

	const LLONG start = display_time - load_ms - eoc_frame_ms;
	return (start < 0) ? 0 : start;
}
/**
 * Push a caption later when its load window would overlap the transmission
 * of the previous caption.
 *
 * Only transmission (bandwidth) overlap is treated as a collision; captions
 * may still overlap visually.
 *
 * @param context Encoder context; scc_last_transmission_end is read
 * @param preroll_start In/out: proposed load start, moved forward on collision
 * @param display_time In/out: display time, moved forward by the same amount
 * @param fps Frame rate, used for the one-frame safety gap
 * @return true if both times were shifted, false if nothing was changed
 */
static bool resolve_collision(struct encoder_ctx *context, LLONG *preroll_start,
			      LLONG *display_time, float fps)
{
	const LLONG busy_until = context->scc_last_transmission_end;

	// No previous transmission recorded, or we start after it has finished
	if (busy_until <= 0 || *preroll_start >= busy_until)
		return false;

	// Start one frame after the previous transmission ends
	const LLONG frame_gap_ms = (LLONG)(1000.0f / fps);
	const LLONG delayed_start = busy_until + frame_gap_ms;
	const LLONG delay = delayed_start - *preroll_start;

	*preroll_start = delayed_start;
	*display_time += delay;
	return true;
}
struct control_code_info
{
unsigned int byte1_odd;
@@ -484,14 +649,156 @@ void write_control_code(const int fd, const unsigned char channel, const enum co
* @param row 0-14 (inclusive)
* @param column 0-31 (inclusive)
*
* //TODO: Preamble code need to take into account font as well
*
* Returns an indent-based preamble code (positions cursor at column with white color)
*/
enum control_code get_preamble_code(const unsigned char row, const unsigned char column)
{
	// Indent preambles are laid out after PREAMBLE_CC_START in groups of
	// eight per row, one entry for each 4-column step.
	const int row_offset = row * 8;
	const int indent_slot = column / 4;
	return PREAMBLE_CC_START + 1 + row_offset + indent_slot;
}
/**
 * Compute the second byte of a styled PAC (color/font applied at column 0).
 * The result lies in 0x40-0x4F, or in 0x60-0x6F when use_high_range is set.
 *
 * @param color The color to use
 * @param font The font style to use
 * @param use_high_range If true, use the 0x60-0x6F range instead of 0x40-0x4F
 *
 * Offsets added to the range base:
 * 0x0: white         0x1: white, underline
 * 0x2: green         0x3: green, underline
 * 0x4: blue          0x5: blue, underline
 * 0x6: cyan          0x7: cyan, underline
 * 0x8: red           0x9: red, underline
 * 0xa: yellow        0xb: yellow, underline
 * 0xc: magenta       0xd: magenta, underline
 * 0xe: white italics 0xf: white italics, underline
 */
static unsigned char get_styled_pac_byte2(enum ccx_decoder_608_color_code color, enum font_bits font, bool use_high_range)
{
	const unsigned char range_base = use_high_range ? 0x60 : 0x40;

	// The italic variants ignore the requested color: they are always white
	if (font == FONT_ITALICS)
		return range_base + 0x0e;
	if (font == FONT_UNDERLINED_ITALICS)
		return range_base + 0x0f;

	// Even offset selects the color; any color without its own entry
	// (e.g. black, transparent, user defined) is emitted as white.
	unsigned char color_slot = 0x00;
	if (color == COL_GREEN)
		color_slot = 0x02;
	else if (color == COL_BLUE)
		color_slot = 0x04;
	else if (color == COL_CYAN)
		color_slot = 0x06;
	else if (color == COL_RED)
		color_slot = 0x08;
	else if (color == COL_YELLOW)
		color_slot = 0x0a;
	else if (color == COL_MAGENTA)
		color_slot = 0x0c;

	// The odd offset right after a color is its underlined variant
	if (font == FONT_UNDERLINED)
		color_slot += 1;

	return range_base + color_slot;
}
/**
 * Tell whether styled PACs for a row use the high byte2 range (0x60-0x6F)
 * rather than the low one (0x40-0x4F).
 *
 * @param row Zero-based row index
 * @return true for zero-based rows 1, 3, 5, 7, 9, 12 and 14
 *         (rows 2, 4, 6, 8, 10, 13, 15 when counted from 1), false otherwise
 */
static bool row_uses_high_range(unsigned char row)
{
	switch (row)
	{
		case 1:
		case 3:
		case 5:
		case 7:
		case 9:
		case 12:
		case 14:
			return true;
		default:
			return false;
	}
}
/**
 * Write a styled PAC code (color/font at column 0) directly
 * This is more efficient than using indent PAC + mid-row code when at column 0
 *
 * @param fd File descriptor
 * @param channel Caption channel (1-4)
 * @param row Row number (0-14)
 * @param color Color to set
 * @param font Font style to set
 * @param disassemble If true, output assembly format instead of hex bytes
 * @param bytes_written Pointer to byte counter; advanced by 2
 */
static void write_styled_preamble(const int fd, const unsigned char channel, const unsigned char row,
				  enum ccx_decoder_608_color_code color, enum font_bits font,
				  const bool disassemble, unsigned int *bytes_written)
{
	// byte1 is shared with the plain (indent) preamble of the same row
	enum control_code base_preamble = get_preamble_code(row, 0);
	unsigned char byte1 = odd_parity(get_first_byte(channel, base_preamble));

	// byte2 carries the color/font selection
	bool use_high_range = row_uses_high_range(row);
	unsigned char byte2 = odd_parity(get_styled_pac_byte2(color, font, use_high_range));

	check_padding(fd, disassemble, bytes_written);

	if (disassemble)
	{
		// Output assembly format like {0100Gr} for row 1, green.
		// The table is indexed by the numeric value of `color`
		// (NOTE(review): assumes the enum order white, green, blue, cyan,
		// red, yellow, magenta, ... - confirm against the enum definition).
		static const char *const color_names[] = {"Wh", "Gr", "Bl", "Cy", "R", "Y", "Ma", "Wh", "Bk", "Wh"};
		const unsigned int color_name_count = (unsigned int)(sizeof(color_names) / sizeof(color_names[0]));
		// Never index past the table: unknown values are printed as white,
		// matching the white fallback used when encoding byte2.
		const char *color_name = ((unsigned int)color < color_name_count) ? color_names[color] : "Wh";

		const char *font_suffix = "";
		if (font == FONT_UNDERLINED)
			font_suffix = "U";
		else if (font == FONT_ITALICS)
			font_suffix = "I";
		else if (font == FONT_UNDERLINED_ITALICS)
			font_suffix = "IU";

		fdprintf(fd, "{%02d00%s%s}", row + 1, color_name, font_suffix);
	}
	else
	{
		// Separate byte pairs with a space when starting a new pair
		if (*bytes_written % 2 == 0)
			write_wrapped(fd, " ", 1);
		fdprintf(fd, "%02x%02x", byte1, byte2);
	}

	*bytes_written += 2;
}
/**
 * Decide whether a styled PAC is applicable: the cursor must be at column 0
 * and the requested style must differ from the default white/regular.
 */
static bool can_use_styled_pac(enum ccx_decoder_608_color_code color, enum font_bits font, unsigned char column)
{
	const bool at_line_start = (column == 0);
	const bool is_default_style = (color == COL_WHITE && font == FONT_REGULAR);

	// Only column 0 qualifies, and the default style needs no styled PAC
	return at_line_start && !is_default_style;
}
enum control_code get_tab_offset_code(const unsigned char column)
{
int offset = column % 4;
@@ -519,6 +826,23 @@ enum control_code get_font_code(enum font_bits font, enum ccx_decoder_608_color_
}
}
// Map the scc_framerate setting to a frame rate.
// Index 0 is the 29.97 default; 1=24, 2=25, 3=30. Values outside 1..3
// all resolve to the default.
static float get_scc_fps(int scc_framerate)
{
	static const float rates[] = {29.97f, 24.0f, 25.0f, 30.0f};

	if (scc_framerate < 1 || scc_framerate > 3)
		return rates[0];
	return rates[scc_framerate];
}
void add_timestamp(const struct encoder_ctx *context, LLONG time, const bool disassemble)
{
write_wrapped(context->out->fh, context->encoded_crlf, context->encoded_crlf_length);
@@ -528,9 +852,15 @@ void add_timestamp(const struct encoder_ctx *context, LLONG time, const bool dis
unsigned hour, minute, second, milli;
millis_to_time(time, &hour, &minute, &second, &milli);
// SMPTE format
float frame = milli * 29.97 / 1000;
fdprintf(context->out->fh, "%02u:%02u:%02u:%02.f\t", hour, minute, second, frame);
// SMPTE format - use configurable frame rate (issue #1191)
float fps = get_scc_fps(context->scc_framerate);
// Number of frame slots per second, rounded UP for fractional rates:
// 29.97 fps still counts frames 0..29, so truncating to 29 here would
// clamp the last valid frame (29) down to 28.
int max_frames = (int)fps;
if ((float)max_frames < fps)
	max_frames++;
// Truncate (never round) so the frame number cannot reach max_frames
int frame = (int)(milli * fps / 1000.0f);
if (frame >= max_frames)
	frame = max_frames - 1; // Cap at max valid frame (e.g., 29 for 29.97fps)
fdprintf(context->out->fh, "%02u:%02u:%02u:%02d\t", hour, minute, second, frame);
}
void clear_screen(const struct encoder_ctx *context, LLONG end_time, const unsigned char channel, const bool disassemble)
@@ -550,8 +880,51 @@ int write_cc_buffer_as_scenarist(const struct eia608_screen *data, struct encode
unsigned char current_row = UINT8_MAX;
unsigned char current_column = UINT8_MAX;
// 1. Load the caption
add_timestamp(context, data->start_time, disassemble);
// Timing variables for accurate timing mode (issue #1120)
LLONG actual_start_time = data->start_time; // When caption should display
LLONG actual_end_time = data->end_time; // When caption should clear
LLONG preroll_start = data->start_time; // When to start loading (default: same as display)
float fps = get_scc_fps_internal(context->scc_framerate);
bool use_separate_display_time = false; // Whether to write EOC at separate timestamp
// If accurate timing is enabled, calculate pre-roll and handle collisions
if (context->scc_accurate_timing)
{
// Calculate total bytes needed for this caption
unsigned int total_bytes = calculate_caption_bytes(data);
// Calculate when we need to start loading
preroll_start = calculate_preroll_time(actual_start_time, total_bytes, fps);
// Check for collisions with previous caption and resolve
if (resolve_collision(context, &preroll_start, &actual_start_time, fps))
{
// Timing was adjusted due to collision
// Also adjust end time by the same amount
LLONG shift = actual_start_time - data->start_time;
actual_end_time = data->end_time + shift;
}
// Update timing state for next caption
float ms_per_frame = 1000.0f / fps;
unsigned int frames_needed = (total_bytes + 1) / 2;
LLONG transmission_time_ms = (LLONG)(frames_needed * ms_per_frame);
context->scc_last_transmission_end = preroll_start + transmission_time_ms;
context->scc_last_display_end = actual_end_time;
// Enable separate display timing (like scc_tools)
use_separate_display_time = true;
// 1. Load the caption at pre-roll time
add_timestamp(context, preroll_start, disassemble);
}
else
{
// Legacy mode: use original timing
// 1. Load the caption
add_timestamp(context, data->start_time, disassemble);
}
write_control_code(context->out->fh, data->channel, RCL, disassemble, &bytes_written);
for (uint8_t row = 0; row < 15; ++row)
{
@@ -578,6 +951,23 @@ int write_cc_buffer_as_scenarist(const struct eia608_screen *data, struct encode
{
if (switch_font || switch_color)
{
// Optimization (issue #1191): Use styled PAC when at column 0 with non-default style
// This avoids needing a separate mid-row code
if (column == 0 && can_use_styled_pac(data->colors[row][column], data->fonts[row][column], 0))
{
write_styled_preamble(context->out->fh, data->channel, row,
data->colors[row][column], data->fonts[row][column],
disassemble, &bytes_written);
current_row = row;
current_column = 0;
current_font = data->fonts[row][column];
current_color = data->colors[row][column];
// Write the character and continue
write_character(context->out->fh, data->characters[row][column], disassemble, &bytes_written);
++current_column;
continue;
}
if (data->characters[row][column] == ' ')
{
// The MID-ROW code is going to move the cursor to the
@@ -617,12 +1007,26 @@ int write_cc_buffer_as_scenarist(const struct eia608_screen *data, struct encode
check_padding(context->out->fh, disassemble, &bytes_written);
}
// 2. Show the caption
// 2. Show the caption (EOC = End of Caption, makes it visible)
if (use_separate_display_time)
{
// For accurate timing: write display command at actual display time
// This matches scc_tools behavior where load and display are separate
add_timestamp(context, actual_start_time, disassemble);
}
write_control_code(context->out->fh, data->channel, EOC, disassemble, &bytes_written);
write_control_code(context->out->fh, data->channel, ENM, disassemble, &bytes_written);
// 3. Clear the caption
clear_screen(context, data->end_time, data->channel, disassemble);
// 3. Clear the caption at the end time
// In accurate timing mode, skip clear - the next caption's EOC will handle the transition
// This matches scc_tools behavior which doesn't write EDM between consecutive captions
if (!use_separate_display_time)
{
// Legacy mode: always write clear
clear_screen(context, actual_end_time, data->channel, disassemble);
}
// In accurate timing mode, scc_last_display_end is still tracked for reference
// but we don't write the clear command to avoid out-of-order timestamps
return 1;
}

View File

@@ -251,6 +251,9 @@ void set_spupng_offset(void *ctx, int x, int y)
sp->xOffset = x;
sp->yOffset = y;
}
// Forward declaration for calculate_spupng_offsets
static void calculate_spupng_offsets(struct spupng_t *sp, struct encoder_ctx *ctx);
int save_spupng(const char *filename, uint8_t *bitmap, int w, int h,
png_color *palette, png_byte *alpha, int nb_color)
{
@@ -384,7 +387,7 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
struct cc_bitmap *rect;
png_color *palette = NULL;
png_byte *alpha = NULL;
int wrote_opentag = 1;
int wrote_opentag = 0; // Track if we actually wrote the tag
x_pos = -1;
y_pos = -1;
@@ -395,13 +398,11 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
return 0;
inc_spupng_fileindex(sp);
write_sputag_open(sp, sub->start_time, sub->end_time - 1);
if (sub->nb_data == 0 && (sub->flags & SUB_EOD_MARKER))
{
context->prev_start = -1;
if (wrote_opentag)
write_sputag_close(sp);
// No subtitle data, skip writing
return 0;
}
rect = sub->data;
@@ -440,7 +441,13 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
}
}
filename = get_spupng_filename(sp);
set_spupng_offset(sp, x_pos, y_pos);
// Set image dimensions for offset calculation
sp->img_w = width;
sp->img_h = height;
// Calculate centered offsets based on screen size (PAL/NTSC)
calculate_spupng_offsets(sp, context);
if (sub->flags & SUB_EOD_MARKER)
context->prev_start = sub->start_time;
pbuf = (uint8_t *)malloc(width * height);
@@ -475,6 +482,15 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
/* TODO do rectangle wise, one color table should not be used for all rectangles */
mapclut_paletee(palette, alpha, (uint32_t *)rect[0].data1, rect[0].nb_colors);
// Save PNG file first
save_spupng(filename, pbuf, width, height, palette, alpha, rect[0].nb_colors);
freep(&pbuf);
// Write XML tag with calculated centered offsets
write_sputag_open(sp, sub->start_time, sub->end_time - 1);
wrote_opentag = 1; // Mark that we wrote the tag
#ifdef ENABLE_OCR
if (!context->nospupngocr)
{
@@ -487,8 +503,6 @@ int write_cc_bitmap_as_spupng(struct cc_subtitle *sub, struct encoder_ctx *conte
}
}
#endif
save_spupng(filename, pbuf, width, height, palette, alpha, rect[0].nb_colors);
freep(&pbuf);
end:
if (wrote_opentag)
@@ -991,6 +1005,8 @@ int spupng_export_string2png(struct spupng_t *sp, char *str, FILE *output)
*/
// Save image
sp->img_w = canvas_width;
sp->img_h = canvas_height;
write_image(buffer, output, canvas_width, canvas_height);
free(tmp);
free(buffer);
@@ -1081,6 +1097,28 @@ int eia608_to_str(struct encoder_ctx *context, struct eia608_screen *data, char
// string needs to be in UTF-8 encoding.
// This function will take care of encoding.
// Center the image described by sp->img_w / sp->img_h on a 720-wide screen
// and store the result in sp->xOffset / sp->yOffset.
// The screen height is 576 when the input is flagged as PAL (or the input
// format is 2), 480 otherwise. Offsets are never negative and yOffset is
// always even.
static void calculate_spupng_offsets(struct spupng_t *sp, struct encoder_ctx *ctx)
{
	int screen_w = 720;
	int screen_h;

	/* Teletext is always PAL */
	if (ctx->in_fileformat == 2 || ctx->is_pal)
	{
		screen_h = 576;
	}
	else
	{
		screen_h = 480;
	}

	sp->xOffset = (screen_w - sp->img_w) / 2;
	sp->yOffset = (screen_h - sp->img_h) / 2;

	// An image larger than the screen cannot be centered; pin it to the
	// top-left corner instead of emitting a negative offset.
	if (sp->xOffset < 0)
		sp->xOffset = 0;
	if (sp->yOffset < 0)
		sp->yOffset = 0;

	// SPU / DVD requires even yOffset (interlacing)
	if (sp->yOffset & 1)
		sp->yOffset++;
}
int spupng_write_string(struct spupng_t *sp, char *string, LLONG start_time, LLONG end_time,
struct encoder_ctx *context)
{
@@ -1099,6 +1137,7 @@ int spupng_write_string(struct spupng_t *sp, char *string, LLONG start_time, LLO
}
// free(string_utf32);
fclose(sp->fppng);
calculate_spupng_offsets(sp, context);
write_sputag_open(sp, start_time, end_time);
write_spucomment(sp, string);
write_sputag_close(sp);

View File

@@ -39,6 +39,8 @@ struct spupng_t
int fileIndex;
int xOffset;
int yOffset;
int img_w;
int img_h;
};
#endif

View File

@@ -182,6 +182,7 @@ typedef struct DVBSubContext
LLONG time_out;
#ifdef ENABLE_OCR
void *ocr_ctx;
int ocr_initialized; // Flag to track if OCR has been lazily initialized
#endif
DVBSubRegion *region_list;
DVBSubCLUT *clut_list;
@@ -418,7 +419,7 @@ static void delete_regions(DVBSubContext *ctx)
* @return DVB context kept as void* for abstraction
*
*/
void *dvbsub_init_decoder(struct dvb_config *cfg, int initialized_ocr)
void *dvbsub_init_decoder(struct dvb_config *cfg)
{
int i, r, g, b, a = 0;
DVBSubContext *ctx = (DVBSubContext *)malloc(sizeof(DVBSubContext));
@@ -442,8 +443,11 @@ void *dvbsub_init_decoder(struct dvb_config *cfg, int initialized_ocr)
}
#ifdef ENABLE_OCR
if (!initialized_ocr)
ctx->ocr_ctx = init_ocr(ctx->lang_index);
// Lazy OCR initialization: don't init here, wait until a bitmap actually needs OCR
// This avoids ~10 second Tesseract startup overhead for files that have DVB streams
// but don't actually produce any bitmap subtitles (e.g., files with CEA-608 captions)
ctx->ocr_ctx = NULL;
ctx->ocr_initialized = 0;
#endif
ctx->version = -1;
@@ -1702,7 +1706,13 @@ static int write_dvb_sub(struct lib_cc_decode *dec_ctx, struct cc_subtitle *sub)
// Perform OCR
#ifdef ENABLE_OCR
char *ocr_str = NULL;
if (ctx->ocr_ctx)
// Lazy OCR initialization: only init when we actually have a bitmap to process
if (!ctx->ocr_initialized)
{
ctx->ocr_ctx = init_ocr(ctx->lang_index);
ctx->ocr_initialized = 1; // Mark as initialized even if init_ocr returns NULL
}
if (ctx->ocr_ctx && region)
{
int ret = ocr_rect(ctx->ocr_ctx, rect, &ocr_str, region->bgcolor, dec_ctx->ocr_quantmode);
if (ret >= 0)

View File

@@ -42,7 +42,7 @@ extern "C"
* @return DVB context kept as void* for abstraction
*
*/
void *dvbsub_init_decoder(struct dvb_config *cfg, int initialized_ocr);
void *dvbsub_init_decoder(struct dvb_config *cfg);
int dvbsub_close_decoder(void **dvb_ctx);

View File

@@ -142,7 +142,7 @@ int user_data(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, struct
{
if ((ud_header[1] & 0x7F) == 0x01)
{
unsigned char cc_data[3 * 31 + 1]; // Maximum cc_count is 31
unsigned char cc_data[3 * 32]; // Increased for safety margin, 31 is max count
dec_ctx->stat_scte20ccheaders++;
read_bytes(ustream, 2); // "03 01"
@@ -370,6 +370,7 @@ int user_data(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, struct
dbg_print(CCX_DMT_PARSE, "%s", debug_608_to_ASC(dishdata, 0));
dbg_print(CCX_DMT_PARSE, "%s:\n", debug_608_to_ASC(dishdata + 3, 0));
dishdata[cc_count * 3] = 0xFF; // Ensure termination for store_hdcc
store_hdcc(enc_ctx, dec_ctx, dishdata, cc_count, dec_ctx->timing->current_tref, dec_ctx->timing->fts_now, sub);
// Ignore 4 (0x020A, followed by two unknown) bytes.
@@ -484,7 +485,10 @@ int user_data(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, struct
mprint("MPEG:VBI: only support Luma line\n");
if (udatalen < 720)
mprint("MPEG:VBI: Minimum 720 bytes in luma line required\n");
{
mprint("MPEG:VBI: Minimum 720 bytes in luma line required, skipping truncated packet.\n");
return 1;
}
decode_vbi(dec_ctx, field, ustream->pos, 720, sub);
dbg_print(CCX_DMT_VERBOSE, "GXF (vbi line %d) user data:\n", line_nb);

View File

@@ -66,6 +66,7 @@ void prepare_for_new_file(struct lib_ccx_ctx *ctx)
{
// Init per file variables
ctx->last_reported_progress = -1;
ctx->min_global_timestamp_offset = -1; // -1 means not yet initialized
ctx->stat_numuserheaders = 0;
ctx->stat_dvdccheaders = 0;
ctx->stat_scte20ccheaders = 0;

View File

@@ -18,6 +18,7 @@
#include "ccx_gxf.h"
#include "dvd_subtitle_decoder.h"
#include "ccx_demuxer_mxf.h"
#include "ccx_dtvcc.h"
int end_of_file = 0; // End of file?
@@ -75,7 +76,7 @@ int ps_get_more_data(struct lib_ccx_ctx *ctx, struct demuxer_data **ppdata)
if (!ctx->demux_ctx->strangeheader)
{
mprint("\nNot a recognized header. Searching for next header.\n");
dump(CCX_DMT_GENERIC_NOTICES, nextheader, 6, 0, 0);
dump(CCX_DMT_PARSE, nextheader, 6, 0, 0);
// Only print the message once per loop / unrecognized header
ctx->demux_ctx->strangeheader = 1;
}
@@ -566,6 +567,104 @@ static size_t process_raw_for_mcc(struct encoder_ctx *enc_ctx, struct lib_cc_dec
}
// Raw file process
// Parse raw CDP (Caption Distribution Packet) data
//
// Scans `buffer` for CDPs (identifier 0x96 0x69), locates the cc_data section
// (0x72) of each one and feeds its triplets to process_cc_data(). The PTS of
// each packet is synthesized from the number of packets delivered so far in
// this call and the frame rate code in the CDP header.
//
// Malformed input is tolerated: a sync word followed by an impossible length
// is treated as a false sync and skipped, and a cc_data section that claims
// more triplets than the packet can hold is ignored.
//
// Returns number of bytes processed (offset of the first byte that was not
// consumed, e.g. the start of a trailing incomplete packet)
static size_t process_raw_cdp(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx,
			      struct cc_subtitle *sub, unsigned char *buffer, size_t len)
{
	// cdp_header: identifier (2) + length (1) + frame rate (1) + flags (1)
	// + header sequence counter (2)
	const size_t cdp_header_size = 7;
	// cdp_footer: id (1) + sequence counter (2) + checksum (1)
	const size_t cdp_footer_size = 4;
	const size_t cdp_min_size = cdp_header_size + cdp_footer_size;

	size_t pos = 0;
	int cdp_count = 0;

	while (pos + 10 < len) // Minimum CDP size
	{
		// Check for CDP identifier
		if (buffer[pos] != 0x96 || buffer[pos + 1] != 0x69)
		{
			pos++;
			continue;
		}

		size_t cdp_length = buffer[pos + 2];
		if (cdp_length < cdp_min_size)
		{
			// Too short to be a real packet (a length of 0 would also stall
			// the scan): treat as a false sync and keep searching.
			pos++;
			continue;
		}
		if (pos + cdp_length > len)
			break; // Incomplete CDP packet

		int framerate_code = buffer[pos + 3] >> 4;

		// Sections start right after the fixed-size header and must end
		// before the footer.
		const size_t cdp_end = pos + cdp_length;
		size_t cdp_pos = pos + cdp_header_size;
		int cc_count = 0;
		unsigned char *cc_data = NULL;

		// Look for cc_data section (0x72) within CDP
		while (cdp_pos < cdp_end - cdp_footer_size)
		{
			const unsigned char section_id = buffer[cdp_pos];

			if (section_id == 0x72) // cc_data section
			{
				const int count = buffer[cdp_pos + 1] & 0x1F;
				// Accept the section only if all triplets lie inside the packet
				if (cdp_pos + 2 + (size_t)count * 3 <= cdp_end)
				{
					cc_count = count;
					cc_data = buffer + cdp_pos + 2;
				}
				break;
			}
			else if (section_id == 0x71) // time code section
			{
				cdp_pos += 5; // Skip time code section
			}
			else if (section_id == 0x73 || section_id == 0x74)
			{
				break; // Service info or footer: past cc_data
			}
			else
			{
				cdp_pos++;
			}
		}

		if (cc_count > 0 && cc_data != NULL)
		{
			// Calculate PTS based on CDP frame count and frame rate
			static const int fps_table[] = {0, 24, 24, 25, 30, 30, 50, 60, 60};
			int fps = (framerate_code < 9) ? fps_table[framerate_code] : 30;
			if (fps <= 0)
				fps = 30; // Code 0 has no rate; avoid dividing by zero
			LLONG pts = (LLONG)cdp_count * 90000 / fps;

			// Set timing if not already set
			if (dec_ctx->timing->pts_set == 0)
			{
				dec_ctx->timing->min_pts = pts;
				dec_ctx->timing->pts_set = 2;
				dec_ctx->timing->sync_pts = pts;
			}
			set_current_pts(dec_ctx->timing, pts);
			set_fts(dec_ctx->timing);

#ifndef DISABLE_RUST
			// Enable DTVCC decoder for CEA-708 captions
			if (dec_ctx->dtvcc_rust)
			{
				int is_active = ccxr_dtvcc_is_active(dec_ctx->dtvcc_rust);
				if (!is_active)
				{
					ccxr_dtvcc_set_active(dec_ctx->dtvcc_rust, 1);
				}
			}
#endif

			// Process cc_data triplets through process_cc_data for 708 support
			process_cc_data(enc_ctx, dec_ctx, cc_data, cc_count, sub);
			cdp_count++;
		}

		pos += cdp_length;
	}

	return pos;
}
int raw_loop(struct lib_ccx_ctx *ctx)
{
LLONG ret;
@@ -575,6 +674,8 @@ int raw_loop(struct lib_ccx_ctx *ctx)
struct lib_cc_decode *dec_ctx = NULL;
int caps = 0;
int is_dvdraw = 0; // Flag to track if this is DVD raw format
int is_scc = 0; // Flag to track if this is SCC format
int is_cdp = 0; // Flag to track if this is raw CDP format
int is_mcc_output = 0; // Flag for MCC output format
dec_ctx = update_decoder_list(ctx);
@@ -607,13 +708,28 @@ int raw_loop(struct lib_ccx_ctx *ctx)
break;
// Check if this is DVD raw format using Rust detection
if (!is_dvdraw && ccxr_is_dvdraw_header(data->buffer, (unsigned int)data->len))
if (!is_dvdraw && !is_scc && ccxr_is_dvdraw_header(data->buffer, (unsigned int)data->len))
{
is_dvdraw = 1;
mprint("Detected McPoodle's DVD raw format\n");
}
if (is_mcc_output && !is_dvdraw)
// Check if this is SCC format using Rust detection
if (!is_scc && !is_dvdraw && ccxr_is_scc_file(data->buffer, (unsigned int)data->len))
{
is_scc = 1;
mprint("Detected SCC (Scenarist Closed Caption) format\n");
}
// Check if this is raw CDP format (starts with 0x9669)
if (!is_cdp && !is_scc && !is_dvdraw && data->len >= 2 &&
data->buffer[0] == 0x96 && data->buffer[1] == 0x69)
{
is_cdp = 1;
mprint("Detected raw CDP (Caption Distribution Packet) format\n");
}
if (is_mcc_output && !is_dvdraw && !is_scc && !is_cdp)
{
// For MCC output, encode raw data directly without decoding
// This preserves the original CEA-608 byte pairs in CDP format
@@ -626,6 +742,18 @@ int raw_loop(struct lib_ccx_ctx *ctx)
// Use Rust implementation - handles timing internally
ret = ccxr_process_dvdraw(dec_ctx, dec_sub, data->buffer, (unsigned int)data->len);
}
else if (is_scc)
{
// Use Rust SCC implementation - handles timing internally via SMPTE timecodes
ret = ccxr_process_scc(dec_ctx, dec_sub, data->buffer, (unsigned int)data->len, ccx_options.scc_framerate);
}
else if (is_cdp)
{
// Process raw CDP packets (e.g., from SDI VANC capture)
ret = process_raw_cdp(enc_ctx, dec_ctx, dec_sub, data->buffer, data->len);
if (ret > 0)
caps = 1;
}
else
{
ret = process_raw(dec_ctx, dec_sub, data->buffer, data->len);
@@ -796,10 +924,6 @@ int process_data(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, str
got = data_node->len;
}
}
else if (data_node->bufferdatatype == CCX_PRIVATE_MPEG2_CC)
{
got = data_node->len; // Do nothing. Still don't know how to process it
}
else if (data_node->bufferdatatype == CCX_RAW) // Raw two byte 608 data from DVR-MS/ASF
{
// The asf_get_more_data() loop sets current_pts when possible
@@ -852,7 +976,34 @@ int process_data(struct encoder_ctx *enc_ctx, struct lib_cc_decode *dec_ctx, str
}
else if (data_node->bufferdatatype == CCX_RAW_TYPE)
{
got = process_raw_with_field(dec_ctx, dec_sub, data_node->buffer, data_node->len);
// CCX_RAW_TYPE contains cc_data triplets (cc_type + 2 data bytes each)
// Used by MXF and GXF demuxers
// Initialize timing if not set (use caption PTS as reference)
if (dec_ctx->timing->pts_set == 0 && data_node->pts != CCX_NOPTS)
{
dec_ctx->timing->min_pts = data_node->pts;
dec_ctx->timing->pts_set = 2; // MinPtsSet
dec_ctx->timing->sync_pts = data_node->pts;
set_fts(dec_ctx->timing);
}
#ifndef DISABLE_RUST
// Enable DTVCC decoder for CEA-708 captions from MXF/GXF
if (dec_ctx->dtvcc_rust)
{
int is_active = ccxr_dtvcc_is_active(dec_ctx->dtvcc_rust);
if (!is_active)
{
ccxr_dtvcc_set_active(dec_ctx->dtvcc_rust, 1);
}
}
#endif
// Use process_cc_data to properly invoke DTVCC decoder for 708 captions
int cc_count = data_node->len / 3;
process_cc_data(enc_ctx, dec_ctx, data_node->buffer, cc_count, dec_sub);
got = data_node->len;
}
else if (data_node->bufferdatatype == CCX_ISDB_SUBTITLE)
{
@@ -1041,7 +1192,11 @@ int process_non_multiprogram_general_loop(struct lib_ccx_ctx *ctx,
cinfo = get_cinfo(ctx->demux_ctx, pid);
*enc_ctx = update_encoder_list_cinfo(ctx, cinfo);
*dec_ctx = update_decoder_list_cinfo(ctx, cinfo);
#ifndef DISABLE_RUST
ccxr_dtvcc_set_encoder((*dec_ctx)->dtvcc_rust, *enc_ctx);
#else
(*dec_ctx)->dtvcc->encoder = (void *)(*enc_ctx);
#endif
if ((*dec_ctx)->timing->min_pts == 0x01FFFFFFFFLL) // if we didn't set the min_pts of the program
{
@@ -1265,7 +1420,11 @@ int general_loop(struct lib_ccx_ctx *ctx)
enc_ctx = update_encoder_list_cinfo(ctx, cinfo);
dec_ctx = update_decoder_list_cinfo(ctx, cinfo);
#ifndef DISABLE_RUST
ccxr_dtvcc_set_encoder(dec_ctx->dtvcc_rust, enc_ctx);
#else
dec_ctx->dtvcc->encoder = (void *)enc_ctx; // WARN: otherwise cea-708 will not work
#endif
if (dec_ctx->timing->min_pts == 0x01FFFFFFFFLL) // if we didn't set the min_pts of the program
{
@@ -1349,7 +1508,24 @@ int general_loop(struct lib_ccx_ctx *ctx)
}
if (ctx->live_stream)
{
int cur_sec = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
LLONG t = get_fts(dec_ctx->timing, dec_ctx->current_field);
if (!t && ctx->demux_ctx->global_timestamp_inited)
t = ctx->demux_ctx->global_timestamp - ctx->demux_ctx->min_global_timestamp;
// Handle multi-program TS timing
if (ctx->demux_ctx->global_timestamp_inited)
{
LLONG offset = ctx->demux_ctx->global_timestamp - ctx->demux_ctx->min_global_timestamp;
if (ctx->min_global_timestamp_offset < 0 || offset < ctx->min_global_timestamp_offset)
ctx->min_global_timestamp_offset = offset;
// Only use timestamps from the program with the lowest base
if (offset - ctx->min_global_timestamp_offset < 60000)
t = offset - ctx->min_global_timestamp_offset;
else
t = ctx->min_global_timestamp_offset > 0 ? 0 : t;
if (t < 0)
t = 0;
}
int cur_sec = (int)(t / 1000);
int th = cur_sec / 10;
if (ctx->last_reported_progress != th)
{
@@ -1367,6 +1543,28 @@ int general_loop(struct lib_ccx_ctx *ctx)
LLONG t = get_fts(dec_ctx->timing, dec_ctx->current_field);
if (!t && ctx->demux_ctx->global_timestamp_inited)
t = ctx->demux_ctx->global_timestamp - ctx->demux_ctx->min_global_timestamp;
// For multi-program TS files, different programs can have different
// PCR bases (e.g., one at 25h, another at 23h). This causes the
// global_timestamp to jump between different bases, resulting in
// wildly different offset values. Track the minimum offset seen
// and only display times from the program with the lowest base.
if (ctx->demux_ctx->global_timestamp_inited)
{
LLONG offset = ctx->demux_ctx->global_timestamp - ctx->demux_ctx->min_global_timestamp;
// Track minimum offset (this is the PCR base of the program
// with the lowest timestamp, which represents true file time)
if (ctx->min_global_timestamp_offset < 0 || offset < ctx->min_global_timestamp_offset)
ctx->min_global_timestamp_offset = offset;
// Only use timestamps from the program with the lowest base.
// If current offset is significantly larger than minimum (by > 60s),
// it's from a program with a higher PCR base - use minimum instead.
if (offset - ctx->min_global_timestamp_offset < 60000)
t = offset - ctx->min_global_timestamp_offset;
else
t = ctx->min_global_timestamp_offset > 0 ? 0 : t; // fallback to minimum-based time
if (t < 0)
t = 0;
}
int cur_sec = (int)(t / 1000);
activity_progress(progress, cur_sec / 60, cur_sec % 60);
ctx->last_reported_progress = progress;
@@ -1475,7 +1673,11 @@ int rcwt_loop(struct lib_ccx_ctx *ctx)
}
dec_ctx = update_decoder_list(ctx);
#ifndef DISABLE_RUST
ccxr_dtvcc_set_encoder(dec_ctx->dtvcc_rust, enc_ctx);
#else
dec_ctx->dtvcc->encoder = (void *)enc_ctx; // WARN: otherwise cea-708 will not work
#endif
if (parsebuf[6] == 0 && parsebuf[7] == 2)
{
dec_ctx->codec = CCX_CODEC_TELETEXT;

View File

@@ -1,7 +1,7 @@
#ifndef CCX_CCEXTRACTOR_H
#define CCX_CCEXTRACTOR_H
#define VERSION "0.96"
#define VERSION "0.96.5"
// Load common includes and constants for library usage
#include "ccx_common_platform.h"
@@ -43,7 +43,7 @@ struct file_report
};
// Stuff for telxcc.c
#define MAX_TLT_PAGES_EXTRACT 8 // Maximum number of teletext pages to extract simultaneously
#define MAX_TLT_PAGES_EXTRACT 8 // Maximum number of teletext pages to extract simultaneously
struct ccx_s_teletext_config
{
@@ -55,11 +55,11 @@ struct ccx_s_teletext_config
uint8_t nonempty : 1; // produce at least one (dummy) frame
// uint8_t se_mode : 1; // search engine compatible mode => Uses CCExtractor's write_format
// uint64_t utc_refvalue; // UTC referential value => Moved to ccx_decoders_common, so can be used for other decoders (608/xds) too
uint16_t user_page; // Page selected by user (legacy, first page)
uint16_t user_page; // Page selected by user (legacy, first page)
// Multi-page teletext extraction (issue #665)
uint16_t user_pages[MAX_TLT_PAGES_EXTRACT]; // Pages selected by user for extraction
int num_user_pages; // Number of pages to extract (0 = auto-detect single page)
int extract_all_pages; // If 1, extract all detected subtitle pages
uint16_t user_pages[MAX_TLT_PAGES_EXTRACT]; // Pages selected by user for extraction
int num_user_pages; // Number of pages to extract (0 = auto-detect single page)
int extract_all_pages; // If 1, extract all detected subtitle pages
int dolevdist; // 0=Don't attempt to correct errors
int levdistmincnt, levdistmaxpct; // Means 2 fails or less is "the same", 10% or less is also "the same"
struct ccx_boundary_time extraction_start, extraction_end; // Segment we actually process
@@ -90,6 +90,7 @@ struct lib_ccx_ctx
LLONG total_past; // Only in binary concat mode
int last_reported_progress;
LLONG min_global_timestamp_offset; // Track minimum (global - min) for multi-program TS
/* Stats */
int stat_numuserheaders;
@@ -160,6 +161,7 @@ struct lib_ccx_ctx *init_libraries(struct ccx_s_options *opt);
void dinit_libraries(struct lib_ccx_ctx **ctx);
extern void ccxr_init_basic_logger();
extern void ccxr_update_logger_target();
// ccextractor.c
void print_end_msg(void);
@@ -183,6 +185,10 @@ size_t process_raw(struct lib_cc_decode *ctx, struct cc_subtitle *sub, unsigned
unsigned int ccxr_process_dvdraw(struct lib_cc_decode *ctx, struct cc_subtitle *sub, const unsigned char *buffer, unsigned int len);
int ccxr_is_dvdraw_header(const unsigned char *buffer, unsigned int len);
// Rust FFI: SCC (Scenarist Closed Caption) format processing (see src/rust/src/demuxer/scc.rs)
unsigned int ccxr_process_scc(struct lib_cc_decode *ctx, struct cc_subtitle *sub, const unsigned char *buffer, unsigned int len, int framerate);
int ccxr_is_scc_file(const unsigned char *buffer, unsigned int len);
int general_loop(struct lib_ccx_ctx *ctx);
void process_hex(struct lib_ccx_ctx *ctx, char *filename);
int rcwt_loop(struct lib_ccx_ctx *ctx);
@@ -337,4 +343,9 @@ int process_non_multiprogram_general_loop(struct lib_ccx_ctx *ctx,
void segment_output_file(struct lib_ccx_ctx *ctx, struct lib_cc_decode *dec_ctx);
int decode_vbi(struct lib_cc_decode *dec_ctx, uint8_t field, unsigned char *buffer, size_t len, struct cc_subtitle *sub);
#ifndef DISABLE_RUST
// Rust FFI function to set encoder on persistent CEA-708 decoder
void ccxr_dtvcc_set_encoder(void *dtvcc_rust, struct encoder_ctx *encoder);
#endif
#endif

View File

@@ -6,6 +6,7 @@
#include <limits.h>
#include <assert.h>
#include "dvb_subtitle_decoder.h"
#include "vobsub_decoder.h"
void skip_bytes(FILE *file, ULLONG n)
{
@@ -121,6 +122,8 @@ void parse_ebml(FILE *file)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -185,6 +188,8 @@ void parse_segment_info(FILE *file)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -483,6 +488,8 @@ void parse_segment_cluster_block_group(struct matroska_ctx *mkv_ctx, ULLONG clus
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -611,6 +618,8 @@ void parse_segment_cluster(struct matroska_ctx *mkv_ctx)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -733,14 +742,24 @@ int process_avc_frame_mkv(struct matroska_ctx *mkv_ctx, struct matroska_avc_fram
{
uint32_t nal_length;
nal_length = bswap32(*(long *)&frame.data[i]);
if (i + nal_unit_size > frame.len)
break;
nal_length =
((uint32_t)frame.data[i] << 24) |
((uint32_t)frame.data[i + 1] << 16) |
((uint32_t)frame.data[i + 2] << 8) |
(uint32_t)frame.data[i + 3];
i += nal_unit_size;
if (nal_length > frame.len - i)
break;
if (nal_length > 0)
do_NAL(enc_ctx, dec_ctx, (unsigned char *)&(frame.data[i]), nal_length, &mkv_ctx->dec_sub);
do_NAL(enc_ctx, dec_ctx, (unsigned char *)&frame.data[i], nal_length, &mkv_ctx->dec_sub);
i += nal_length;
} // outer for
assert(i == frame.len);
mkv_ctx->current_second = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
@@ -768,11 +787,22 @@ int process_hevc_frame_mkv(struct matroska_ctx *mkv_ctx, struct matroska_avc_fra
{
uint32_t nal_length;
nal_length = bswap32(*(long *)&frame.data[i]);
if (i + nal_unit_size > frame.len)
break;
nal_length =
((uint32_t)frame.data[i] << 24) |
((uint32_t)frame.data[i + 1] << 16) |
((uint32_t)frame.data[i + 2] << 8) |
(uint32_t)frame.data[i + 3];
i += nal_unit_size;
if (nal_length > frame.len - i)
break;
if (nal_length > 0)
do_NAL(enc_ctx, dec_ctx, (unsigned char *)&(frame.data[i]), nal_length, &mkv_ctx->dec_sub);
do_NAL(enc_ctx, dec_ctx, (unsigned char *)&frame.data[i], nal_length, &mkv_ctx->dec_sub);
i += nal_length;
}
@@ -844,6 +874,8 @@ void parse_segment_track_entry(struct matroska_ctx *mkv_ctx)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -1172,7 +1204,7 @@ void parse_private_codec_data(struct matroska_ctx *mkv_ctx, char *codec_id_strin
memset((void *)&cnf, 0, sizeof(struct dvb_config));
parse_dvb_description(&cnf, codec_data, 8);
dec_ctx->private_data = dvbsub_init_decoder(&cnf, 0);
dec_ctx->private_data = dvbsub_init_decoder(&cnf);
free(codec_data);
}
@@ -1196,6 +1228,8 @@ void parse_segment_tracks(struct matroska_ctx *mkv_ctx)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
@@ -1240,6 +1274,8 @@ void parse_segment(struct matroska_ctx *mkv_ctx)
{
code <<= 8;
code += mkv_read_byte(file);
if (feof(file))
break;
code_len++;
switch (code)
{
@@ -1334,11 +1370,362 @@ char *ass_ssa_sentence_erase_read_order(char *text)
return buf;
}
/* VOBSUB support: write an MPEG-2 Program Stream pack header into buf.
 * Exactly 14 bytes are written:
 *   bytes 0-3   pack start code (00 00 01 BA)
 *   bytes 4-9   SCR (System Clock Reference); the base is pts_90khz, the extension is 0
 *   bytes 10-12 mux rate followed by two marker bits
 *   byte  13    reserved bits with a stuffing length of 0
 * The caller must supply a buffer of at least 14 bytes.
 */
static void generate_ps_pack_header(unsigned char *buf, ULLONG pts_90khz)
{
	const ULLONG base = pts_90khz; // SCR base: the PTS doubles as the clock reference
	const int ext = 0;	       // SCR extension
	const int rate = 10080;	       // Mux rate (10080 = standard DVD rate)
	unsigned char *out = buf;

	// Pack start code
	*out++ = 0x00;
	*out++ = 0x00;
	*out++ = 0x01;
	*out++ = 0xBA;

	// SCR, MPEG-2 layout: 01 SCR[32:30] 1 SCR[29:15] 1 SCR[14:0] 1 SCR_ext[8:0] 1
	*out++ = 0x44 | ((base >> 27) & 0x38) | ((base >> 28) & 0x03);
	*out++ = (base >> 20) & 0xFF;
	*out++ = 0x04 | ((base >> 12) & 0xF8) | ((base >> 13) & 0x03);
	*out++ = (base >> 5) & 0xFF;
	*out++ = 0x04 | ((base << 3) & 0xF8) | ((ext >> 7) & 0x03);
	*out++ = ((ext << 1) & 0xFE) | 0x01;

	// Mux rate spread over three bytes; the low two bits of the last byte are markers
	*out++ = (rate >> 14) & 0xFF;
	*out++ = (rate >> 6) & 0xFF;
	*out++ = ((rate << 2) & 0xFC) | 0x03;

	// Reserved bits set, stuffing length = 0
	*out = 0xF8;
}
/* VOBSUB support: write a PES header for private stream 1 (stream id 0xBD) into buf.
 * The header carries a PTS only and ends with the one-byte substream id
 * (0x20 + stream_id), so 15 bytes are always written.
 *
 * buf          destination, at least 15 bytes
 * pts_90khz    presentation timestamp in 90 kHz ticks
 * payload_size number of payload bytes that will follow the header
 * stream_id    offset added to 0x20 to form the substream id
 *
 * Returns the number of header bytes written (15).
 *
 * NOTE(review): the packet length is stored in two bytes, so a payload_size
 * above 65526 makes the stored length wrap - confirm callers never exceed it.
 */
static int generate_pes_header(unsigned char *buf, ULLONG pts_90khz, int payload_size, int stream_id)
{
	enum
	{
		PTS_FIELD_LEN = 5,    // PES header data: PTS only
		PES_HEADER_TOTAL = 15 // bytes emitted by this function
	};
	// Length counts everything after the length field:
	// 3 flag/length bytes + PTS + 1 substream id byte + payload
	const int packet_len = 3 + PTS_FIELD_LEN + 1 + payload_size;

	// Start code prefix followed by the private stream 1 id
	buf[0] = 0x00;
	buf[1] = 0x00;
	buf[2] = 0x01;
	buf[3] = 0xBD;

	// Big-endian packet length
	buf[4] = (packet_len >> 8) & 0xFF;
	buf[5] = packet_len & 0xFF;

	buf[6] = 0x81;		// MPEG-2 marker bits, "original" flag set
	buf[7] = 0x80;		// PTS_DTS_flags = 10 (PTS only)
	buf[8] = PTS_FIELD_LEN; // PES header data length

	// PTS (5 bytes): '0010' | PTS[32:30] | '1' | PTS[29:15] | '1' | PTS[14:0] | '1'
	buf[9] = 0x21 | ((pts_90khz >> 29) & 0x0E);
	buf[10] = (pts_90khz >> 22) & 0xFF;
	buf[11] = 0x01 | ((pts_90khz >> 14) & 0xFE);
	buf[12] = (pts_90khz >> 7) & 0xFF;
	buf[13] = 0x01 | ((pts_90khz << 1) & 0xFE);

	// Substream id (0x20 is the first VOBSUB stream)
	buf[14] = 0x20 + stream_id;

	return PES_HEADER_TOTAL;
}
/* VOBSUB support: format a millisecond count as the .idx timestamp
 * "HH:MM:SS:mmm" (hours are not capped, so they may use more than two digits).
 * At most bufsize bytes, including the terminator, are written to buf.
 */
static void generate_vobsub_timestamp(char *buf, size_t bufsize, ULLONG milliseconds)
{
	const ULLONG total_seconds = milliseconds / 1000;
	const ULLONG ms = milliseconds % 1000;
	const ULLONG seconds = total_seconds % 60;
	const ULLONG minutes = (total_seconds / 60) % 60;
	const ULLONG hours = total_seconds / 3600;

	snprintf(buf, bufsize, "%02" LLU_M ":%02" LLU_M ":%02" LLU_M ":%03" LLU_M,
		 hours, minutes, seconds, ms);
}
/* Returns 1 when the requested output format is one of the text formats
 * (SRT, SSA, WebVTT, transcript, SAMI, SMPTE-TT), which bitmap subtitles can
 * only reach through OCR; returns 0 for every other format.
 */
static int is_text_output_format(enum ccx_output_format format)
{
	switch (format)
	{
		case CCX_OF_SRT:
		case CCX_OF_SSA:
		case CCX_OF_WEBVTT:
		case CCX_OF_TRANSCRIPT:
		case CCX_OF_SAMI:
		case CCX_OF_SMPTETT:
			return 1;
		default:
			return 0;
	}
}
/* VOBSUB support: decode every stored SPU packet of a Matroska VOBSUB track,
 * run it through the VOBSUB decoder/OCR path and pass each result that
 * produced output to the encoder.
 * Aborts via fatal() when OCR support is unavailable or the decoder cannot be
 * created; returns immediately when the track holds no subtitles.
 */
static void process_vobsub_track_ocr(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
{
	struct vobsub_ctx *vob_ctx;
	struct encoder_ctx *enc_ctx;
	int i;

	if (track->sentence_count == 0)
	{
		mprint("\nNo VOBSUB subtitles to process");
		return;
	}

	/* Text output from bitmap subtitles is only possible with OCR */
	if (!vobsub_ocr_available())
	{
		fatal(EXIT_NOT_CLASSIFIED,
		      "VOBSUB to text conversion requires OCR support.\n"
		      "Please rebuild CCExtractor with -DWITH_OCR=ON or use raw output (--out=idx)");
	}

	vob_ctx = init_vobsub_decoder();
	if (vob_ctx == NULL)
	{
		fatal(EXIT_NOT_CLASSIFIED,
		      "VOBSUB to text conversion requires OCR, but initialization failed.\n"
		      "Please ensure Tesseract is installed with language data.");
	}

	/* The track header (CodecPrivate) is handed over for palette parsing */
	if (track->header != NULL)
		vobsub_parse_palette(vob_ctx, track->header);

	mprint("\nProcessing VOBSUB track with OCR (%d subtitles)", track->sentence_count);

	/* Encoder context that receives the decoded subtitles */
	enc_ctx = update_encoder_list(mkv_ctx->ctx);

	for (i = 0; i < track->sentence_count; i++)
	{
		struct matroska_sub_sentence *cur = track->sentences[i];
		const int done = i + 1;
		struct cc_subtitle sub;
		ULLONG end_time;
		int ret;

		mkv_ctx->sentence_count++;

		/* An end time of 0 means "not specified": fall back to just before
		 * the next subtitle, or to a 5 second duration for the last one.
		 * NOTE(review): if the next subtitle starts at 0 the "- 1" would
		 * wrap, assuming time_start is unsigned - confirm. */
		end_time = cur->time_end;
		if (end_time == 0)
		{
			if (done < track->sentence_count)
				end_time = track->sentences[done]->time_start - 1;
			else
				end_time = cur->time_start + 5000;
		}

		/* Decode the SPU and run OCR */
		memset(&sub, 0, sizeof(sub));
		ret = vobsub_decode_spu(vob_ctx,
					(unsigned char *)cur->text,
					cur->text_size,
					cur->time_start,
					end_time,
					&sub);

		if (ret == 0 && sub.got_output)
		{
			/* Emit the subtitle in the selected output format */
			encode_sub(enc_ctx, &sub);

			/* Release the bitmaps attached to the subtitle */
			if (sub.data != NULL)
			{
				struct cc_bitmap *rects = (struct cc_bitmap *)sub.data;
				for (int j = 0; j < sub.nb_data; j++)
				{
					struct cc_bitmap *bmp = &rects[j];
					if (bmp->data0 != NULL)
						free(bmp->data0);
					if (bmp->data1 != NULL)
						free(bmp->data1);
#ifdef ENABLE_OCR
					if (bmp->ocr_text != NULL)
						free(bmp->ocr_text);
#endif
				}
				free(sub.data);
			}
		}

		/* Report progress every 50 subtitles and once more at the end */
		if (done % 50 == 0 || done == track->sentence_count)
		{
			mprint("\rProcessing VOBSUB: %d/%d subtitles", done, track->sentence_count);
		}
	}

	delete_vobsub_decoder(&vob_ctx);
	mprint("\nVOBSUB OCR processing complete");
}
/* VOBSUB support: Save VOBSUB track to .idx and .sub files
 *
 * Writes two files named "<basename>_<lang>[_<lang_index>].idx/.sub":
 *  - .idx: the track header (if any), an "id:" language line and one
 *    "timestamp: ..., filepos: ..." line per subtitle;
 *  - .sub: for each subtitle a PS pack header, a PES header and the raw SPU
 *    data, zero-padded up to the next VOBSUB_BLOCK_SIZE boundary.
 * Returns without writing anything when the track has no subtitles or when
 * either output file cannot be created. Results of write_wrapped() are not
 * checked here.
 */
/* Size in bytes to which every subtitle written to the .sub file is padded */
#define VOBSUB_BLOCK_SIZE 2048
static void save_vobsub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
{
if (track->sentence_count == 0)
{
mprint("\nNo VOBSUB subtitles to write");
return;
}
// Generate base filename (without extension)
// Prefer the IETF language tag when the track has one.
// assumes at least one of lang_ietf / lang is non-NULL - TODO confirm
const char *lang_to_use = track->lang_ietf ? track->lang_ietf : track->lang;
// NOTE(review): the pointer returned by get_basename() is never released in
// this function - confirm whether the caller is expected to free it.
const char *basename = get_basename(mkv_ctx->filename);
// 32 extra bytes leave room for the separators, the numeric index and the terminator
size_t needed = strlen(basename) + strlen(lang_to_use) + 32;
char *base_filename = malloc(needed);
if (base_filename == NULL)
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
// The numeric suffix is only appended when lang_index is non-zero
if (track->lang_index == 0)
snprintf(base_filename, needed, "%s_%s", basename, lang_to_use);
else
snprintf(base_filename, needed, "%s_%s_" LLD, basename, lang_to_use, track->lang_index);
// Create .sub filename
// (+5 covers the 4-character extension plus the terminator)
char *sub_filename = malloc(needed + 5);
if (sub_filename == NULL)
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
snprintf(sub_filename, needed + 5, "%s.sub", base_filename);
// Create .idx filename
char *idx_filename = malloc(needed + 5);
if (idx_filename == NULL)
fatal(EXIT_NOT_ENOUGH_MEMORY, "In save_vobsub_track: Out of memory.");
snprintf(idx_filename, needed + 5, "%s.idx", base_filename);
mprint("\nOutput files: %s, %s", idx_filename, sub_filename);
// Open .sub file
// On WIN32 the .sub file is opened with O_BINARY because it carries binary data
int sub_desc;
#ifdef WIN32
sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IREAD | S_IWRITE);
#else
sub_desc = open(sub_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
#endif
if (sub_desc < 0)
{
mprint("\nError: Cannot create .sub file");
free(base_filename);
free(sub_filename);
free(idx_filename);
return;
}
// Open .idx file
// Unlike the .sub file, the WIN32 branch does not pass O_BINARY here
int idx_desc;
#ifdef WIN32
idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IREAD | S_IWRITE);
#else
idx_desc = open(idx_filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
#endif
if (idx_desc < 0)
{
mprint("\nError: Cannot create .idx file");
// The .sub descriptor is already open at this point and must be closed too
close(sub_desc);
free(base_filename);
free(sub_filename);
free(idx_filename);
return;
}
// Write .idx header (from CodecPrivate)
if (track->header != NULL)
write_wrapped(idx_desc, track->header, strlen(track->header));
// Add language identifier line
// The index is always written as 0, matching the stream_id of 0 passed to
// generate_pes_header() below
char lang_line[128];
snprintf(lang_line, sizeof(lang_line), "\nid: %s, index: 0\n", lang_to_use);
write_wrapped(idx_desc, lang_line, strlen(lang_line));
// Buffer for PS/PES headers and padding
// header_buf is reused for the 14-byte pack header and then the PES header
unsigned char header_buf[32];
unsigned char zero_buf[VOBSUB_BLOCK_SIZE];
memset(zero_buf, 0, VOBSUB_BLOCK_SIZE);
// Running byte offset into the .sub file, recorded as "filepos" in the .idx
ULLONG file_pos = 0;
// Write each subtitle
for (int i = 0; i < track->sentence_count; i++)
{
struct matroska_sub_sentence *sentence = track->sentences[i];
mkv_ctx->sentence_count++;
// Convert timestamp to 90kHz PTS
// (time_start is formatted as milliseconds below; 1 ms = 90 ticks)
ULLONG pts_90khz = sentence->time_start * 90;
// Write timestamp entry to .idx
char timestamp[32];
generate_vobsub_timestamp(timestamp, sizeof(timestamp), sentence->time_start);
char idx_entry[128];
// filepos is printed in hexadecimal, zero-padded to 9 digits
snprintf(idx_entry, sizeof(idx_entry), "timestamp: %s, filepos: %09" LLX_M "\n",
timestamp, file_pos);
write_wrapped(idx_desc, idx_entry, strlen(idx_entry));
// Generate PS Pack header (14 bytes)
generate_ps_pack_header(header_buf, pts_90khz);
write_wrapped(sub_desc, (char *)header_buf, 14);
// Generate PES header (15 bytes)
// NOTE(review): the whole SPU goes into a single PES packet whose length
// field is 16 bits wide - confirm text_size always fits
int pes_header_len = generate_pes_header(header_buf, pts_90khz, sentence->text_size, 0);
write_wrapped(sub_desc, (char *)header_buf, pes_header_len);
// Write SPU data
write_wrapped(sub_desc, sentence->text, sentence->text_size);
// Calculate bytes written and pad to block boundary
ULLONG bytes_written = 14 + pes_header_len + sentence->text_size;
ULLONG padding_needed = VOBSUB_BLOCK_SIZE - (bytes_written % VOBSUB_BLOCK_SIZE);
// padding_needed equals VOBSUB_BLOCK_SIZE when already aligned: skip padding then
if (padding_needed < VOBSUB_BLOCK_SIZE)
{
write_wrapped(sub_desc, (char *)zero_buf, padding_needed);
bytes_written += padding_needed;
}
// Advance the offset so the next .idx entry points at the next subtitle
file_pos += bytes_written;
}
close(sub_desc);
close(idx_desc);
free(base_filename);
free(sub_filename);
free(idx_filename);
}
void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *track)
{
char *filename;
int desc;
// VOBSUB tracks need special handling
if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
{
// Check if user wants text output (SRT, SSA, WebVTT, etc.)
if (ccx_options.write_format_rewritten &&
is_text_output_format(ccx_options.enc_cfg.write_format))
{
// Use OCR to convert VOBSUB to text
process_vobsub_track_ocr(mkv_ctx, track);
}
else
{
// Output raw idx/sub files
save_vobsub_track(mkv_ctx, track);
}
return;
}
if (mkv_ctx->ctx->cc_to_stdout == CCX_TRUE)
{
desc = 1; // file descriptor of stdout
@@ -1358,11 +1745,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
if (track->header != NULL)
write_wrapped(desc, track->header, strlen(track->header));
if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
{
mprint("\nError: VOBSUB not supported");
}
for (int i = 0; i < track->sentence_count; i++)
{
struct matroska_sub_sentence *sentence = track->sentences[i];
@@ -1497,10 +1879,6 @@ void save_sub_track(struct matroska_ctx *mkv_ctx, struct matroska_sub_track *tra
free(timestamp_start);
free(timestamp_end);
}
else if (track->codec_id == MATROSKA_TRACK_SUBTITLE_CODEC_ID_VOBSUB)
{
// TODO: Add support for VOBSUB
}
}
}
@@ -1572,6 +1950,9 @@ void matroska_parse(struct matroska_ctx *mkv_ctx)
{
code <<= 8;
code += mkv_read_byte(file);
// Check for EOF after reading - feof() is only set after a failed read
if (feof(file))
break;
code_len++;
switch (code)
@@ -1623,8 +2004,13 @@ int matroska_loop(struct lib_ccx_ctx *ctx)
{
if (ccx_options.write_format_rewritten)
{
mprint(MATROSKA_WARNING "You are using --out=<format>, but Matroska parser extract subtitles in a recorded format\n");
mprint("--out=<format> will be ignored\n");
/* Note: For VOBSUB tracks, text output formats (SRT, SSA, etc.) are
* supported via OCR. For other subtitle types, the native format is used. */
if (!is_text_output_format(ccx_options.enc_cfg.write_format))
{
mprint(MATROSKA_WARNING "You are using --out=<format>, but Matroska parser extracts subtitles in their recorded format\n");
mprint("--out=<format> will be ignored for non-VOBSUB tracks\n");
}
}
// Don't need generated input file

View File

@@ -5,26 +5,31 @@
#if (defined(WIN32) || defined(_WIN32_WCE)) && (defined(__MINGW32__) || !defined(__GNUC__))
#define LLD_M "I64d"
#define LLU_M "I64u"
#define LLX_M "I64x"
#define LLD "%I64d"
#define LLU "%I64u"
#elif defined(__SYMBIAN32__)
#define LLD_M "d"
#define LLU_M "u"
#define LLX_M "x"
#define LLD "%d"
#define LLU "%u"
#elif defined(__DARWIN__) || defined(__APPLE__)
#define LLD_M "lld"
#define LLU_M "llu"
#define LLX_M "llx"
#define LLD "%lld"
#define LLU "%llu"
#elif defined(_LP64) /* Unix 64 bits */
#define LLD_M "ld"
#define LLU_M "lu"
#define LLX_M "lx"
#define LLD "%ld"
#define LLU "%lu"
#else /* Unix 32 bits */
#define LLD_M "lld"
#define LLU_M "llu"
#define LLX_M "llx"
#define LLD "%lld"
#define LLU "%llu"
#endif

View File

@@ -12,6 +12,7 @@
#include "ccx_mp4.h"
#include "activity.h"
#include "ccx_dtvcc.h"
#include "vobsub_decoder.h"
#define MEDIA_TYPE(type, subtype) (((u64)(type) << 32) + (subtype))
@@ -25,15 +26,22 @@
#define GF_ISOM_SUBTYPE_HVC1 GF_4CC('h', 'v', 'c', '1')
#endif
static short bswap16(short v)
// VOBSUB subtype (mp4s or MPEG)
#ifndef GF_ISOM_SUBTYPE_MPEG4
#define GF_ISOM_SUBTYPE_MPEG4 GF_4CC('M', 'P', 'E', 'G')
#endif
static int16_t bswap16(int16_t v)
{
return ((v >> 8) & 0x00FF) | ((v << 8) & 0xFF00);
}
static long bswap32(long v)
static int32_t bswap32(int32_t v)
{
// For 0x12345678 returns 78563412
long swapped = ((v & 0xFF) << 24) | ((v & 0xFF00) << 8) | ((v & 0xFF0000) >> 8) | ((v & 0xFF000000) >> 24);
// Use int32_t instead of long for consistent behavior across platforms
// (long is 4 bytes on Windows x64 but 8 bytes on Linux x64)
int32_t swapped = ((v & 0xFF) << 24) | ((v & 0xFF00) << 8) | ((v & 0xFF0000) >> 8) | ((v & 0xFF000000) >> 24);
return swapped;
}
static struct
@@ -76,10 +84,10 @@ static int process_avc_sample(struct lib_ccx_ctx *ctx, u32 timescale, GF_AVCConf
nal_length = s->data[i];
break;
case 2:
nal_length = bswap16(*(short *)&s->data[i]);
nal_length = bswap16(*(int16_t *)&s->data[i]);
break;
case 4:
nal_length = bswap32(*(long *)&s->data[i]);
nal_length = bswap32(*(int32_t *)&s->data[i]);
break;
}
const u32 previous_index = i;
@@ -145,10 +153,10 @@ static int process_hevc_sample(struct lib_ccx_ctx *ctx, u32 timescale, GF_HEVCCo
nal_length = s->data[i];
break;
case 2:
nal_length = bswap16(*(short *)&s->data[i]);
nal_length = bswap16(*(int16_t *)&s->data[i]);
break;
case 4:
nal_length = bswap32(*(long *)&s->data[i]);
nal_length = bswap32(*(int32_t *)&s->data[i]);
break;
default:
mprint("Unexpected nal_unit_size %u in HEVC config\n", c->nal_unit_size);
@@ -202,6 +210,13 @@ static int process_xdvb_track(struct lib_ccx_ctx *ctx, const char *basename, GF_
dec_ctx = update_decoder_list(ctx);
enc_ctx = update_encoder_list(ctx);
// Set buffer data type to CCX_PES for MP4/MOV MPEG-2 tracks.
// This ensures cb_field counters are not incremented in do_cb(),
// which is correct because container formats associate captions
// with the frame's PTS directly.
dec_ctx->in_bufferdatatype = CCX_PES;
if ((sample_count = gf_isom_get_sample_count(f, track)) < 1)
{
return 0;
@@ -249,6 +264,12 @@ static int process_avc_track(struct lib_ccx_ctx *ctx, const char *basename, GF_I
dec_ctx = update_decoder_list(ctx);
// Set buffer data type to CCX_H264 for MP4/MOV AVC tracks.
// This ensures cb_field counters are not incremented in do_cb(),
// which is correct because container formats associate captions
// with the frame's PTS directly.
dec_ctx->in_bufferdatatype = CCX_H264;
if ((sample_count = gf_isom_get_sample_count(f, track)) < 1)
{
return 0;
@@ -326,6 +347,12 @@ static int process_hevc_track(struct lib_ccx_ctx *ctx, const char *basename, GF_
// Enable HEVC mode
dec_ctx->avc_ctx->is_hevc = 1;
// Set buffer data type to CCX_H264 for MP4/MOV HEVC tracks.
// This ensures cb_field counters are not incremented in do_cb(),
// which is correct because container formats associate captions
// with the frame's PTS directly.
dec_ctx->in_bufferdatatype = CCX_H264;
if ((sample_count = gf_isom_get_sample_count(f, track)) < 1)
{
return 0;
@@ -391,6 +418,144 @@ static int process_hevc_track(struct lib_ccx_ctx *ctx, const char *basename, GF_
return status;
}
/* Process one MP4 VOBSUB (subp:MPEG) track: every sample is decoded as an SPU
 * packet, run through OCR and, when output was produced, handed to the encoder.
 *
 * ctx    library context (supplies decoder/encoder contexts and progress state)
 * f      open ISO media file
 * track  1-based track number
 * sub    caller's subtitle; only got_output is set, when any sample yielded output
 *
 * Returns 0 on success (also for a track without samples) and -1 when the
 * track reports a zero timescale. Aborts via fatal() when OCR support is
 * unavailable or the VOBSUB decoder cannot be created.
 */
static int process_vobsub_track(struct lib_ccx_ctx *ctx, GF_ISOFile *f, u32 track, struct cc_subtitle *sub)
{
	u32 timescale, i, sample_count;
	int status = 0;

	struct lib_cc_decode *dec_ctx = NULL;
	struct encoder_ctx *enc_ctx = NULL;
	struct vobsub_ctx *vob_ctx = NULL;

	dec_ctx = update_decoder_list(ctx);
	enc_ctx = update_encoder_list(ctx);

	if ((sample_count = gf_isom_get_sample_count(f, track)) < 1)
	{
		return 0;
	}

	timescale = gf_isom_get_media_timescale(f, track);
	/* The timescale is used as a divisor below and GPAC reports errors as 0;
	 * bail out before anything is allocated instead of dividing by zero. */
	if (timescale == 0)
	{
		mprint("Error: VOBSUB track %u has a zero timescale\n", track);
		return -1;
	}

	/* Check if OCR is available */
	if (!vobsub_ocr_available())
	{
		fatal(EXIT_NOT_CLASSIFIED,
		      "VOBSUB to text conversion requires OCR support.\n"
		      "Please rebuild CCExtractor with -DWITH_OCR=ON");
	}

	/* Initialize VOBSUB decoder */
	vob_ctx = init_vobsub_decoder();
	if (!vob_ctx)
	{
		fatal(EXIT_NOT_CLASSIFIED,
		      "VOBSUB decoder initialization failed.\n"
		      "Please ensure Tesseract is installed with language data.");
	}

	/* Try to get decoder config for palette info */
	GF_GenericSampleDescription *gdesc = gf_isom_get_generic_sample_description(f, track, 1);
	if (gdesc && gdesc->extension_buf && gdesc->extension_buf_size > 0)
	{
		/* The extension buffer may contain an idx-like header with palette.
		 * It is copied so that it can be NUL-terminated before parsing. */
		char *header = malloc(gdesc->extension_buf_size + 1);
		if (header)
		{
			memcpy(header, gdesc->extension_buf, gdesc->extension_buf_size);
			header[gdesc->extension_buf_size] = '\0';
			vobsub_parse_palette(vob_ctx, header);
			free(header);
		}
	}
	if (gdesc)
	{
		/* GPAC hands back the description and its extension buffer as two
		 * allocations that the caller releases; free both, not just the
		 * outer structure. */
		if (gdesc->extension_buf)
			free(gdesc->extension_buf);
		free(gdesc);
	}

	mprint("Processing VOBSUB track (%u samples)\n", sample_count);

	for (i = 0; i < sample_count; i++)
	{
		u32 sdi;
		/* Sample numbers are 1-based */
		GF_ISOSample *s = gf_isom_get_sample(f, track, i + 1, &sdi);

		if (s != NULL)
		{
			/* The CTS offset is treated as signed so negative offsets work */
			s32 signed_cts = (s32)s->CTS_Offset;
			LLONG start_time_ms = (LLONG)((s->DTS + signed_cts) * 1000) / timescale;

			/* Calculate end time from next sample if available */
			LLONG end_time_ms = 0;
			if (i + 1 < sample_count)
			{
				u32 next_sdi;
				GF_ISOSample *next_s = gf_isom_get_sample(f, track, i + 2, &next_sdi);
				if (next_s)
				{
					s32 next_signed_cts = (s32)next_s->CTS_Offset;
					end_time_ms = (LLONG)((next_s->DTS + next_signed_cts) * 1000) / timescale;
					gf_isom_sample_del(&next_s);
				}
			}
			/* 0 doubles as "no end time known" */
			if (end_time_ms == 0)
				end_time_ms = start_time_ms + 5000; /* Default 5 second duration */

			/* Keep the decoder timing in step with this sample */
			set_current_pts(dec_ctx->timing, (s->DTS + signed_cts) * MPEG_CLOCK_FREQ / timescale);
			set_fts(dec_ctx->timing);

			/* Decode SPU and run OCR */
			struct cc_subtitle vob_sub;
			memset(&vob_sub, 0, sizeof(vob_sub));

			int ret = vobsub_decode_spu(vob_ctx,
						    (unsigned char *)s->data, s->dataLength,
						    start_time_ms, end_time_ms,
						    &vob_sub);

			if (ret == 0 && vob_sub.got_output)
			{
				/* Encode the subtitle to output format */
				encode_sub(enc_ctx, &vob_sub);
				sub->got_output = 1;

				/* Free subtitle data */
				if (vob_sub.data)
				{
					struct cc_bitmap *rect = (struct cc_bitmap *)vob_sub.data;
					for (int j = 0; j < vob_sub.nb_data; j++)
					{
						if (rect[j].data0)
							free(rect[j].data0);
						if (rect[j].data1)
							free(rect[j].data1);
#ifdef ENABLE_OCR
						if (rect[j].ocr_text)
							free(rect[j].ocr_text);
#endif
					}
					free(vob_sub.data);
				}
			}

			gf_isom_sample_del(&s);
		}

		/* Report progress only when the percentage changes */
		int progress = (int)((i * 100) / sample_count);
		if (ctx->last_reported_progress != progress)
		{
			int cur_sec = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
			activity_progress(progress, cur_sec / 60, cur_sec % 60);
			ctx->last_reported_progress = progress;
		}
	}

	int cur_sec = (int)(get_fts(dec_ctx->timing, dec_ctx->current_field) / 1000);
	activity_progress(100, cur_sec / 60, cur_sec % 60);

	delete_vobsub_decoder(&vob_ctx);
	mprint("VOBSUB processing complete\n");

	return status;
}
static char *format_duration(u64 dur, u32 timescale, char *szDur, size_t szDur_size)
{
u32 h, m, s, ms;
@@ -584,7 +749,11 @@ static int process_clcp(struct lib_ccx_ctx *ctx, struct encoder_ctx *enc_ctx,
dbg_print(CCX_DMT_PARSE, "MP4-708: atom skipped (cc_type < 2)\n");
continue;
}
#ifndef DISABLE_RUST
ccxr_dtvcc_process_data(dec_ctx->dtvcc_rust, cc_valid, cc_type, temp[2], temp[3]);
#else
dtvcc_process_data(dec_ctx->dtvcc, (unsigned char *)temp);
#endif
cb_708++;
}
if (ctx->write_format == CCX_OF_MCC)
@@ -722,10 +891,19 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
if (enc_ctx)
enc_ctx->timing = dec_ctx->timing;
// WARN: otherwise cea-708 will not work
// WARN: otherwise cea-708 will not work
#ifndef DISABLE_RUST
ccxr_dtvcc_set_encoder(dec_ctx->dtvcc_rust, enc_ctx);
#else
dec_ctx->dtvcc->encoder = (void *)enc_ctx;
#endif
memset(&dec_sub, 0, sizeof(dec_sub));
if (file == NULL)
{
mprint("Error: NULL file path provided to processmp4\n");
return -1;
}
mprint("Opening \'%s\': ", file);
#ifdef MP4_DEBUG
gf_log_set_tool_level(GF_LOG_CONTAINER, GF_LOG_DEBUG);
@@ -745,6 +923,7 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
avc_track_count = 0;
hevc_track_count = 0;
cc_track_count = 0;
u32 vobsub_track_count = 0;
for (i = 0; i < track_count; i++)
{
@@ -760,9 +939,11 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
avc_track_count++;
if (type == GF_ISOM_MEDIA_VISUAL && (subtype == GF_ISOM_SUBTYPE_HEV1 || subtype == GF_ISOM_SUBTYPE_HVC1))
hevc_track_count++;
if (type == GF_ISOM_MEDIA_SUBPIC && subtype == GF_ISOM_SUBTYPE_MPEG4)
vobsub_track_count++;
}
mprint("MP4: found %u tracks: %u avc, %u hevc and %u cc\n", track_count, avc_track_count, hevc_track_count, cc_track_count);
mprint("MP4: found %u tracks: %u avc, %u hevc, %u cc, %u vobsub\n", track_count, avc_track_count, hevc_track_count, cc_track_count, vobsub_track_count);
for (i = 0; i < track_count; i++)
{
@@ -880,6 +1061,24 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
}
break;
case MEDIA_TYPE(GF_ISOM_MEDIA_SUBPIC, GF_ISOM_SUBTYPE_MPEG4): // subp:MPEG (VOBSUB)
// If there are multiple VOBSUB tracks, change fd for different tracks
if (vobsub_track_count > 1)
{
switch_output_file(ctx, enc_ctx, i);
}
if (process_vobsub_track(ctx, f, i + 1, &dec_sub) != 0)
{
mprint("Error on process_vobsub_track()\n");
free(dec_ctx->xds_ctx);
return -3;
}
if (dec_sub.got_output)
{
mp4_ret = 1;
}
break;
default:
if (type != GF_ISOM_MEDIA_CLOSED_CAPTION && type != GF_ISOM_MEDIA_SUBT && type != GF_ISOM_MEDIA_TEXT)
break; // ignore non cc track
@@ -1019,9 +1218,14 @@ int processmp4(struct lib_ccx_ctx *ctx, struct ccx_s_mp4Cfg *cfg, char *file)
mprint("Found no HEVC track(s). ");
if (cc_track_count)
mprint("Found %d CC track(s).\n", cc_track_count);
mprint("Found %d CC track(s). ", cc_track_count);
else
mprint("Found no dedicated CC track(s).\n");
mprint("Found no dedicated CC track(s). ");
if (vobsub_track_count)
mprint("Found %d VOBSUB track(s).\n", vobsub_track_count);
else
mprint("\n");
ctx->freport.mp4_cc_track_cnt = cc_track_count;

View File

@@ -103,7 +103,8 @@ int set_nonblocking(int fd);
void connect_to_srv(const char *addr, const char *port, const char *cc_desc, const char *pwd)
{
#ifndef DISABLE_RUST
return ccxr_connect_to_srv(addr, port, cc_desc, pwd);
(void)ccxr_connect_to_srv(addr, port, cc_desc, pwd);
return;
#endif
if (NULL == addr)
{
@@ -137,7 +138,8 @@ void connect_to_srv(const char *addr, const char *port, const char *cc_desc, con
void net_send_header(const unsigned char *data, size_t len)
{
#ifndef DISABLE_RUST
return ccxr_net_send_header(data, len);
(void)ccxr_net_send_header(data, len);
return;
#endif
assert(srv_sd > 0);
@@ -188,7 +190,8 @@ int net_send_cc(const unsigned char *data, int len, void *private_data, struct c
void net_check_conn()
{
#ifndef DISABLE_RUST
return ccxr_net_check_conn();
ccxr_net_check_conn();
return;
#endif
time_t now;
static time_t last_ping = 0;
@@ -252,7 +255,8 @@ void net_send_epg(
const char *category)
{
#ifndef DISABLE_RUST
return ccxr_net_send_epg(start, stop, title, desc, lang, category);
(void)ccxr_net_send_epg(start, stop, title, desc, lang, category);
return;
#endif
size_t st;
size_t sp;

View File

@@ -8,6 +8,11 @@
#include <dirent.h>
#include "ccx_encoders_helpers.h"
#include "ccx_encoders_spupng.h"
#ifdef _WIN32
#include <windows.h>
#elif defined(__APPLE__)
#include <mach-o/dyld.h>
#endif
#include "ocr.h"
struct ocrCtx
@@ -100,6 +105,68 @@ void delete_ocr(void **arg)
freep(arg);
}
/**
 * get_executable_directory
 *
 * Resolves the directory that holds the running executable.
 *
 * The lookup is attempted only once; its outcome (including failure) is
 * remembered in function-local static storage and handed back on every
 * later call.
 *
 * @return Pointer to a static, NUL-terminated buffer holding the directory
 *         (without a trailing separator), or NULL when the path could not
 *         be determined or the platform is not handled here.
 */
static const char *get_executable_directory(void)
{
	static char exe_dir[1024] = {0};
	static int initialized = 0;

	if (!initialized)
	{
		char path[1024];
		char *cut = NULL; /* last path separator, when the lookup worked */

		initialized = 1; /* a failed lookup is cached as well */
		path[0] = '\0';
#ifdef _WIN32
		{
			DWORD n = GetModuleFileNameA(NULL, path, MAX_PATH);
			/* 0 means failure, MAX_PATH or more means the result did not fit */
			if (n != 0 && n < MAX_PATH)
				cut = strrchr(path, '\\');
		}
#elif defined(__linux__)
		{
			/* readlink() does not terminate the string, so leave room for it */
			ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);
			if (n > 0)
			{
				path[n] = '\0';
				cut = strrchr(path, '/');
			}
		}
#elif defined(__APPLE__)
		{
			uint32_t cap = sizeof(path);
			if (_NSGetExecutablePath(path, &cap) == 0)
				cut = strrchr(path, '/');
		}
#endif
		if (cut != NULL)
		{
			/* Drop the file name, keep only the directory part */
			*cut = '\0';
			strncpy(exe_dir, path, sizeof(exe_dir) - 1);
			exe_dir[sizeof(exe_dir) - 1] = '\0';
		}
	}

	return exe_dir[0] != '\0' ? exe_dir : NULL;
}
/**
* probe_tessdata_location
*
@@ -107,8 +174,10 @@ void delete_ocr(void **arg)
*
* Priority of Tesseract traineddata file search paths:-
* 1. tessdata in TESSDATA_PREFIX, if it is specified. Overrides others
* 2. tessdata in current working directory
* 3. tessdata in /usr/share
* 2. tessdata in executable directory (for bundled tessdata)
* 3. tessdata in current working directory
* 4. tessdata in system locations (/usr/share, etc.)
* 5. tessdata in default Tesseract install location (Windows)
*/
char *probe_tessdata_location(const char *lang)
{
@@ -116,6 +185,7 @@ char *probe_tessdata_location(const char *lang)
const char *paths[] = {
getenv("TESSDATA_PREFIX"),
get_executable_directory(),
"./",
"/usr/share/",
"/usr/local/share/",
@@ -211,6 +281,13 @@ void *init_ocr(int lang_index)
// set PSM mode
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);
// Set character blacklist to prevent common OCR errors (e.g. | vs I)
// These characters are rarely used in subtitles but often misrecognized
if (ccx_options.ocr_blacklist)
{
TessBaseAPISetVariable(ctx->api, "tessedit_char_blacklist", "|\\`_~");
}
free(pars_vec);
free(pars_values);
@@ -281,6 +358,176 @@ BOX *ignore_alpha_at_edge(png_byte *alpha, unsigned char *indata, int w, int h,
return cropWindow;
}
/**
 * Structure to hold the vertical boundaries of a detected text line.
 *
 * Both values are row indices into the bitmap handed to detect_text_lines()
 * (0 is the top row). The range is inclusive on both ends, so the height of
 * the line is end_y - start_y + 1.
 */
struct line_bounds
{
int start_y; // Top row of line (inclusive)
int end_y; // Bottom row of line (inclusive)
};
/**
 * Splits a paletted bitmap into horizontal text lines.
 *
 * A row counts as "visible" when at least one of its pixels maps to a
 * non-zero alpha value. A text line is a run of rows that starts on a
 * visible row and ends once min_gap consecutive fully transparent rows
 * have been seen (or the image ends).
 *
 * @param alpha Palette alpha values (indexed by pixel value)
 * @param indata Bitmap pixel data (palette indices, w*h bytes)
 * @param w Image width
 * @param h Image height
 * @param lines Output: allocated array of line boundaries (caller must free)
 * @param num_lines Output: number of lines found
 * @param min_gap Minimum consecutive transparent rows to count as line separator
 * @return 0 on success, -1 on failure (bad arguments, allocation failure,
 *         or no visible row at all); on failure *lines is NULL and
 *         *num_lines is 0 whenever those pointers were usable
 */
static int detect_text_lines(png_byte *alpha, unsigned char *indata,
			     int w, int h,
			     struct line_bounds **lines, int *num_lines,
			     int min_gap)
{
	int *visible;
	struct line_bounds *found;
	struct line_bounds *shrunk;
	int capacity;
	int count = 0;
	int run_start = -1; /* first row of the open line, -1 when outside a line */
	int blank_run = 0;  /* transparent rows seen since the last visible row */
	int x, y;

	if (!alpha || !indata || !lines || !num_lines || w <= 0 || h <= 0)
		return -1;

	*lines = NULL;
	*num_lines = 0;

	/* Pass 1: flag every row that carries at least one visible pixel */
	visible = (int *)malloc(h * sizeof(int));
	if (!visible)
		return -1;

	for (y = 0; y < h; y++)
	{
		const unsigned char *row = indata + y * w;

		/* Stop at the first pixel whose palette entry is not transparent */
		for (x = 0; x < w && alpha[row[x]] == 0; x++)
			;
		visible[y] = (x < w) ? 1 : 0;
	}

	/* Every line needs a visible row plus a separator, so h/2 + 1 is plenty */
	capacity = (h / 2) + 1;
	found = (struct line_bounds *)malloc(capacity * sizeof(struct line_bounds));
	if (!found)
	{
		free(visible);
		return -1;
	}

	/* Pass 2: group visible rows into lines */
	for (y = 0; y < h; y++)
	{
		if (visible[y])
		{
			if (run_start < 0)
				run_start = y; /* a new line opens here */
			blank_run = 0;
			continue;
		}

		if (run_start < 0)
			continue; /* transparent row between lines */

		if (++blank_run < min_gap)
			continue; /* gap still too small to split the line */

		/* Gap is large enough: the line ended on the row before the gap */
		if (count < capacity)
		{
			found[count].start_y = run_start;
			found[count].end_y = y - blank_run;
			count++;
		}
		run_start = -1;
		blank_run = 0;
	}

	/* The image may end while a line is still open */
	if (run_start >= 0 && count < capacity)
	{
		found[count].start_y = run_start;
		/* blank_run trailing rows (fewer than min_gap) follow the last visible row */
		found[count].end_y = h - 1 - blank_run;
		count++;
	}

	free(visible);

	if (count == 0)
	{
		free(found);
		return -1;
	}

	/* Trim the array to the number of lines found; keep the big one if that fails */
	shrunk = (struct line_bounds *)realloc(found, count * sizeof(struct line_bounds));
	*lines = shrunk ? shrunk : found;
	*num_lines = count;
	return 0;
}
/**
 * Runs Tesseract on an image that holds exactly one line of text.
 *
 * The page segmentation mode is switched to 7 (single line) for the
 * duration of the call and put back to its previous value before
 * returning, whether or not recognition succeeded.
 *
 * @param ctx OCR context (contains Tesseract API)
 * @param line_pix Pre-processed PIX for single line (grayscale, inverted)
 * @return Recognized text (caller must free with free()), or NULL on failure
 */
static char *ocr_single_line(struct ocrCtx *ctx, PIX *line_pix)
{
	char *result = NULL;
	int previous_psm;

	if (ctx == NULL || ctx->api == NULL || line_pix == NULL)
		return NULL;

	/* Remember the caller's PSM so it can be restored afterwards */
	previous_psm = TessBaseAPIGetPageSegMode(ctx->api);
	TessBaseAPISetPageSegMode(ctx->api, 7); // PSM_SINGLE_LINE

	TessBaseAPISetImage2(ctx->api, line_pix);
	if (TessBaseAPIRecognize(ctx->api, NULL) == 0)
	{
		char *raw = TessBaseAPIGetUTF8Text(ctx->api);

		if (raw != NULL)
		{
			/* Hand out a malloc()ed copy so the caller can use plain free() */
			result = strdup(raw);
			TessDeleteText(raw);
		}
	}

	TessBaseAPISetPageSegMode(ctx->api, previous_psm);
	return result;
}
void debug_tesseract(struct ocrCtx *ctx, char *dump_path)
{
#ifdef OCR_DEBUG
@@ -327,6 +574,8 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
unsigned int *data, *ppixel;
BOOL tess_ret = FALSE;
struct ocrCtx *ctx = arg;
char *combined_text = NULL; // Used by line-split mode
size_t combined_len = 0; // Used by line-split mode
pix = pixCreate(w, h, 32);
color_pix = pixCreate(w, h, 32);
if (pix == NULL || color_pix == NULL)
@@ -406,6 +655,98 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
return NULL;
}
// Line splitting mode: detect lines and OCR each separately with PSM 7
if (ccx_options.ocr_line_split && h > 30)
{
struct line_bounds *lines = NULL;
int num_lines = 0;
// Use min_gap of 3 rows to detect line boundaries
if (detect_text_lines(alpha, indata, w, h, &lines, &num_lines, 3) == 0 && num_lines > 1)
{
// Multiple lines detected - process each separately with PSM 7
// (combined_text and combined_len are declared at function scope)
for (int line_idx = 0; line_idx < num_lines; line_idx++)
{
int line_h = lines[line_idx].end_y - lines[line_idx].start_y + 1;
if (line_h <= 0)
continue;
// Extract line region from the grayscale image
BOX *line_box = boxCreate(0, lines[line_idx].start_y,
pixGetWidth(cpix_gs), line_h);
PIX *line_pix_raw = pixClipRectangle(cpix_gs, line_box, NULL);
boxDestroy(&line_box);
if (line_pix_raw)
{
// Add white padding around the line (helps Tesseract with edge characters)
// The image is inverted (dark text on light bg), so add white (255) border
int padding = 10;
PIX *line_pix = pixAddBorderGeneral(line_pix_raw, padding, padding, padding, padding, 255);
pixDestroy(&line_pix_raw);
if (!line_pix)
continue;
char *line_text = ocr_single_line(ctx, line_pix);
pixDestroy(&line_pix);
if (line_text)
{
// Trim trailing whitespace from line
size_t line_len = strlen(line_text);
while (line_len > 0 && (line_text[line_len - 1] == '\n' ||
line_text[line_len - 1] == '\r' ||
line_text[line_len - 1] == ' '))
{
line_text[--line_len] = '\0';
}
if (line_len > 0)
{
// Append to combined result
size_t new_len = combined_len + line_len + 2; // +1 for newline, +1 for null
char *new_combined = (char *)realloc(combined_text, new_len);
if (new_combined)
{
combined_text = new_combined;
if (combined_len > 0)
{
combined_text[combined_len++] = '\n';
}
strcpy(combined_text + combined_len, line_text);
combined_len += line_len;
}
}
free(line_text);
}
}
}
free(lines);
if (combined_text && combined_len > 0)
{
// Successfully processed lines - skip whole-image OCR
// but continue to color detection below
goto line_split_color_detection;
}
// If we got here, line splitting didn't produce results
// Fall through to whole-image OCR
if (combined_text)
free(combined_text);
combined_text = NULL;
}
else
{
// Line detection failed or only 1 line - fall through to whole-image OCR
if (lines)
free(lines);
}
}
// Standard whole-image OCR path
TessBaseAPISetImage2(ctx->api, cpix_gs);
tess_ret = TessBaseAPIRecognize(ctx->api, NULL);
debug_tesseract(ctx, "./temp/");
@@ -448,6 +789,14 @@ char *ocr_bitmap(void *arg, png_color *palette, png_byte *alpha, unsigned char *
fatal(EXIT_NOT_ENOUGH_MEMORY, "In ocr_bitmap: Out of memory allocating text_out.");
}
// Jump target for line-split mode: use combined_text and continue with color detection
if (0)
{
line_split_color_detection:
text_out = combined_text;
combined_text = NULL; // Transfer ownership
}
// Begin color detection
// Using tlt_config.nofontcolor or ccx_options.nofontcolor (true when "--no-fontcolor" parameter used) to skip color detection if not required
// This is also skipped if --no-spupngocr is set since the OCR output won't be used anyway

View File

@@ -14,7 +14,19 @@ void dinit_write(struct ccx_s_write *wb)
return;
}
if (wb->fh > 0)
{
// Check if the file is empty before closing
off_t file_size = lseek(wb->fh, 0, SEEK_END);
close(wb->fh);
// Delete empty output files to avoid generating useless 0-byte files
// This commonly happens with -12 option when one field has no captions
if (file_size == 0 && wb->filename != NULL)
{
unlink(wb->filename);
mprint("Deleted empty output file: %s\n", wb->filename);
}
}
freep(&wb->filename);
freep(&wb->original_filename);
if (wb->with_semaphore && wb->semaphore_filename)

View File

@@ -13,8 +13,11 @@
#include "../lib_hash/sha2.h"
#include <string.h>
#include <stdio.h>
#if __has_include(<utf8proc.h>)
#include <utf8proc.h>
#else
#include <utf8proc/utf8proc.h>
#endif
#ifdef ENABLE_OCR
#include <tesseract/capi.h>
#include <leptonica/allheaders.h>
@@ -398,6 +401,13 @@ void print_usage(void)
mprint(" 12 Sparse text with OSD.\n");
mprint(" 13 Raw line. Treat the image as a single text line,\n");
mprint(" bypassing hacks that are Tesseract-specific.\n");
mprint(" --ocr-line-split: Split subtitle images into lines before OCR.\n");
mprint(" Uses PSM 7 (single text line mode) for each line,\n");
mprint(" which can improve accuracy for multi-line bitmap subtitles\n");
mprint(" (VOBSUB, DVD, DVB).\n");
mprint(" --no-ocr-blacklist: Disable the OCR character blacklist. By default,\n");
mprint(" CCExtractor blacklists characters like |, \\, `, _, ~\n");
mprint(" that are commonly misrecognized (e.g. 'I' as '|').\n");
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
mprint(" stream will be processed. e.g. 'eng' for English.\n");
mprint(" Language codes can be either the 3 letters bibliographic\n");

View File

@@ -78,6 +78,30 @@ void detect_stream_type(struct ccx_demuxer *ctx)
ctx->startbytes[7] == 0xf8)
ctx->stream_mode = CCX_SM_MCPOODLESRAW;
}
// Check for SCC (Scenarist Closed Caption) text format
// SCC files start with "Scenarist_SCC V1.0" (18 bytes), optionally with UTF-8 BOM (3 bytes)
if (ctx->stream_mode == CCX_SM_ELEMENTARY_OR_NOT_FOUND)
{
unsigned char *check_buf = ctx->startbytes;
int check_pos = 0;
// Skip UTF-8 BOM if present
if (ctx->startbytes_avail >= 3 &&
ctx->startbytes[0] == 0xEF &&
ctx->startbytes[1] == 0xBB &&
ctx->startbytes[2] == 0xBF)
{
check_buf += 3;
check_pos = 3;
}
if (ctx->startbytes_avail >= check_pos + 18 &&
memcmp(check_buf, "Scenarist_SCC V1.0", 18) == 0)
{
ctx->stream_mode = CCX_SM_SCC;
mprint("Detected SCC (Scenarist Closed Caption) format\n");
}
}
#ifdef WTV_DEBUG
if (ctx->stream_mode == CCX_SM_ELEMENTARY_OR_NOT_FOUND && ctx->startbytes_avail >= 6)
{

View File

@@ -434,10 +434,21 @@ void remap_g0_charset(uint8_t c)
{
if (c != primary_charset.current)
{
if (c >= 56)
{
fprintf(stderr, "- G0 Latin National Subset ID 0x%1x.%1x is out of bounds\n", (c >> 3), (c & 0x7));
return;
}
uint8_t m = G0_LATIN_NATIONAL_SUBSETS_MAP[c];
if (m == 0xff)
{
fprintf(stderr, "- G0 Latin National Subset ID 0x%1x.%1x is not implemented\n", (c >> 3), (c & 0x7));
return;
}
else if (m >= 14)
{
fprintf(stderr, "- G0 Latin National Subset index %d is out of bounds\n", m);
return;
}
else
{
@@ -1392,7 +1403,7 @@ int tlt_process_pes_packet(struct lib_cc_decode *dec_ctx, uint8_t *buffer, uint1
uint8_t pes_ext_flag;
// extension
uint32_t t = 0;
uint16_t i;
uint32_t i;
struct TeletextCtx *ctx = dec_ctx->private_data;
ctx->sentence_cap = sentence_cap;
@@ -1468,6 +1479,9 @@ int tlt_process_pes_packet(struct lib_cc_decode *dec_ctx, uint8_t *buffer, uint1
if (pes_packet_length > size)
pes_packet_length = size;
if (size < 9)
return CCX_OK;
// optional PES header marker bits (10.. ....)
if ((buffer[6] & 0xc0) == 0x80)
{
@@ -1480,8 +1494,16 @@ int tlt_process_pes_packet(struct lib_cc_decode *dec_ctx, uint8_t *buffer, uint1
{
if ((optional_pes_header_included == YES) && ((buffer[7] & 0x80) > 0))
{
ctx->using_pts = YES;
dbg_print(CCX_DMT_TELETEXT, "- PID 0xbd PTS available\n");
if (size < 14)
{
ctx->using_pts = NO;
dbg_print(CCX_DMT_TELETEXT, "- PID 0xbd PTS signaled but packet too short, using TS PCR\n");
}
else
{
ctx->using_pts = YES;
dbg_print(CCX_DMT_TELETEXT, "- PID 0xbd PTS available\n");
}
}
else
{
@@ -1554,11 +1576,17 @@ int tlt_process_pes_packet(struct lib_cc_decode *dec_ctx, uint8_t *buffer, uint1
if (optional_pes_header_included == YES)
i += 3 + optional_pes_header_length;
while (i <= pes_packet_length - 6)
while (i + 2 <= pes_packet_length)
{
uint8_t data_unit_id = buffer[i++];
uint8_t data_unit_len = buffer[i++];
if (i + data_unit_len > pes_packet_length)
{
dbg_print(CCX_DMT_TELETEXT, "- Teletext data unit length %u exceeds PES packet length, stopping.\n", data_unit_len);
break;
}
if ((data_unit_id == DATA_UNIT_EBU_TELETEXT_NONSUBTITLE) || (data_unit_id == DATA_UNIT_EBU_TELETEXT_SUBTITLE))
{
// teletext payload has always size 44 bytes

View File

@@ -6,6 +6,7 @@
#include "dvb_subtitle_decoder.h"
#include "ccx_decoders_isdb.h"
#include "file_buffer.h"
#include <inttypes.h>
#ifdef DEBUG_SAVE_TS_PACKETS
#include <sys/types.h>
@@ -153,12 +154,11 @@ enum ccx_bufferdata_type get_buffer_type(struct cap_info *cinfo)
{
return CCX_TELETEXT;
}
else if (cinfo->stream == CCX_STREAM_TYPE_PRIVATE_MPEG2 && cinfo->codec == CCX_CODEC_ATSC_CC)
{
return CCX_PRIVATE_MPEG2_CC;
}
else if (cinfo->stream == CCX_STREAM_TYPE_PRIVATE_USER_MPEG2 && cinfo->codec == CCX_CODEC_ATSC_CC)
else if ((cinfo->stream == CCX_STREAM_TYPE_PRIVATE_MPEG2 ||
cinfo->stream == CCX_STREAM_TYPE_PRIVATE_USER_MPEG2) &&
cinfo->codec == CCX_CODEC_ATSC_CC)
{
// ATSC CC can be in either private stream type - process both as PES
return CCX_PES;
}
else
@@ -567,17 +567,15 @@ int copy_capbuf_demux_data(struct ccx_demuxer *ctx, struct demuxer_data **data,
if (!cinfo->capbuf || !cinfo->capbuflen)
return -1;
if (ptr->bufferdatatype == CCX_PRIVATE_MPEG2_CC)
{
dump(CCX_DMT_GENERIC_NOTICES, cinfo->capbuf, cinfo->capbuflen, 0, 1);
// Bogus data, so we return something
ptr->buffer[ptr->len++] = 0xFA;
ptr->buffer[ptr->len++] = 0x80;
ptr->buffer[ptr->len++] = 0x80;
return CCX_OK;
}
if (cinfo->codec == CCX_CODEC_TELETEXT)
{
if (cinfo->capbuflen > BUFSIZE - ptr->len)
{
fatal(CCX_COMMON_EXIT_BUG_BUG,
"Teletext packet (%" PRId64 ") larger than remaining buffer (%" PRId64 ").\n",
cinfo->capbuflen, (int64_t)(BUFSIZE - ptr->len));
}
memcpy(ptr->buffer + ptr->len, cinfo->capbuf, cinfo->capbuflen);
ptr->len += cinfo->capbuflen;
return CCX_OK;
@@ -672,7 +670,6 @@ void cinfo_cremation(struct ccx_demuxer *ctx, struct demuxer_data **data)
int copy_payload_to_capbuf(struct cap_info *cinfo, struct ts_payload *payload)
{
int newcapbuflen;
if (cinfo->ignore == CCX_TRUE &&
((cinfo->stream != CCX_STREAM_TYPE_VIDEO_MPEG2 &&
@@ -698,17 +695,22 @@ int copy_payload_to_capbuf(struct cap_info *cinfo, struct ts_payload *payload)
}
// copy payload to capbuf
newcapbuflen = cinfo->capbuflen + payload->length;
if (newcapbuflen > cinfo->capbufsize)
if (payload->length > INT64_MAX - cinfo->capbuflen)
{
unsigned char *new_capbuf = (unsigned char *)realloc(cinfo->capbuf, newcapbuflen);
mprint("Error: capbuf size overflow\n");
return -1;
}
int64_t newcapbuflen = (int64_t)cinfo->capbuflen + payload->length;
if (newcapbuflen > (int64_t)cinfo->capbufsize)
{
unsigned char *new_capbuf = (unsigned char *)realloc(cinfo->capbuf, (size_t)newcapbuflen);
if (!new_capbuf)
return -1;
cinfo->capbuf = new_capbuf;
cinfo->capbufsize = newcapbuflen;
cinfo->capbufsize = newcapbuflen; // Note: capbufsize is int in struct cap_info
}
memcpy(cinfo->capbuf + cinfo->capbuflen, payload->start, payload->length);
cinfo->capbuflen = newcapbuflen;
cinfo->capbuflen = newcapbuflen; // Note: capbuflen is int in struct cap_info
return CCX_OK;
}

View File

@@ -50,8 +50,8 @@ struct EPG_rating
struct EPG_event
{
uint32_t id;
char start_time_string[21]; //"YYYYMMDDHHMMSS +0000" = 20 chars
char end_time_string[21];
char start_time_string[74]; // "YYYYMMDDHHMMSS +0000" = 20 chars, 74 to silence compiler warning
char end_time_string[74];
uint8_t running_status;
uint8_t free_ca_mode;
char ISO_639_language_code[4];

View File

@@ -173,7 +173,7 @@ static void *init_private_data(enum ccx_code_type codec)
case CCX_CODEC_TELETEXT:
return telxcc_init();
case CCX_CODEC_DVB:
return dvbsub_init_decoder(NULL, 0);
return dvbsub_init_decoder(NULL);
default:
return NULL;
}

View File

@@ -399,9 +399,7 @@ int parse_PMT(struct ccx_demuxer *ctx, unsigned char *buf, int len, struct progr
ret = parse_dvb_description(&cnf, es_info, desc_len);
if (ret < 0)
break;
ptr = dvbsub_init_decoder(&cnf, pinfo->initialized_ocr);
if (!pinfo->initialized_ocr)
pinfo->initialized_ocr = 1;
ptr = dvbsub_init_decoder(&cnf);
if (ptr == NULL)
break;
update_capinfo(ctx, elementary_PID, stream_type, CCX_CODEC_DVB, program_number, ptr);
@@ -413,9 +411,18 @@ int parse_PMT(struct ccx_demuxer *ctx, unsigned char *buf, int len, struct progr
{
// If any commonly used video stream type clashes with the ATSC/SCTE standard,
// this code can be moved behind an ATSC-specific flag
// Validate ES_info_length against buffer bounds to prevent heap overflow
if (i + 5 + ES_info_length > len)
break;
unsigned char *es_info = buf + i + 5;
for (desc_len = 0; (buf + i + 5 + ES_info_length) > es_info; es_info += desc_len)
unsigned char *es_info_end = buf + i + 5 + ES_info_length;
for (desc_len = 0; es_info_end > es_info; es_info += desc_len)
{
// Need at least 2 bytes for descriptor_tag and desc_len
if (es_info + 2 > es_info_end)
break;
enum ccx_mpeg_descriptor descriptor_tag = (enum ccx_mpeg_descriptor)(*es_info++);
int nb_service;
int is_608;
@@ -439,9 +446,18 @@ int parse_PMT(struct ccx_demuxer *ctx, unsigned char *buf, int len, struct progr
if (IS_FEASIBLE(ctx->codec, ctx->nocodec, CCX_CODEC_TELETEXT) && ES_info_length && stream_type == CCX_STREAM_TYPE_PRIVATE_MPEG2) // MPEG-2 Packetized Elementary Stream packets containing private data
{
// Validate ES_info_length against buffer bounds
if (i + 5 + ES_info_length > len)
continue;
unsigned char *es_info = buf + i + 5;
for (desc_len = 0; (buf + i + 5 + ES_info_length) - es_info; es_info += desc_len)
unsigned char *es_info_end = buf + i + 5 + ES_info_length;
for (desc_len = 0; es_info_end > es_info; es_info += desc_len)
{
// Need at least 2 bytes for descriptor_tag and desc_len
if (es_info + 2 > es_info_end)
break;
enum ccx_mpeg_descriptor descriptor_tag = (enum ccx_mpeg_descriptor)(*es_info++);
desc_len = (*es_info++);
if (!IS_VALID_TELETEXT_DESC(descriptor_tag))
@@ -576,6 +592,15 @@ void ts_buffer_psi_packet(struct ccx_demuxer *ctx)
else if (ccounter == ctx->PID_buffers[pid]->prev_ccounter + 1 || (ctx->PID_buffers[pid]->prev_ccounter == 0x0f && ccounter == 0))
{
ctx->PID_buffers[pid]->prev_ccounter = ccounter;
// Check for integer overflow and reasonable size limit (1MB)
if (ctx->PID_buffers[pid]->buffer_length > 1024 * 1024 ||
payload_length > 1024 * 1024 ||
ctx->PID_buffers[pid]->buffer_length + payload_length > 1024 * 1024)
{
dbg_print(CCX_DMT_GENERIC_NOTICES, "\rWarning: PSI buffer for PID %u exceeded reasonable limit (1MB), discarding.\n", pid);
return;
}
void *tmp = realloc(ctx->PID_buffers[pid]->buffer, ctx->PID_buffers[pid]->buffer_length + payload_length);
if (tmp == NULL)
{
@@ -614,6 +639,10 @@ int parse_PAT(struct ccx_demuxer *ctx)
payload_start = ctx->PID_buffers[0]->buffer + pointer_field + 1;
payload_length = ctx->PID_buffers[0]->buffer_length - (pointer_field + 1);
// Need at least 8 bytes to read header fields
if (payload_length < 8)
return 0;
section_number = payload_start[6];
last_section_number = payload_start[7];

View File

@@ -87,13 +87,11 @@ void EPG_ATSC_decode_ETT_text(uint8_t *offset, uint32_t length, struct EPG_event
for (j = 0; j < number_segments && offset < offset_end; j++)
{
uint8_t compression_type, mode, number_bytes;
uint8_t number_bytes;
if (offset + 3 > offset_end)
return;
compression_type = offset[0];
mode = offset[1];
number_bytes = offset[2];
offset += 3;
@@ -127,7 +125,7 @@ void EPG_ATSC_calc_time(char *output, uint32_t time)
timeinfo.tm_hour = 0;
timeinfo.tm_isdst = -1;
mktime(&timeinfo);
snprintf(output, 21, "%02d%02d%02d%02d%02d%02d +0000", timeinfo.tm_year + 1900, timeinfo.tm_mon + 1, timeinfo.tm_mday, timeinfo.tm_hour, timeinfo.tm_min, timeinfo.tm_sec);
snprintf(output, 74, "%02d%02d%02d%02d%02d%02d +0000", timeinfo.tm_year + 1900, timeinfo.tm_mon + 1, timeinfo.tm_mday, timeinfo.tm_hour, timeinfo.tm_min, timeinfo.tm_sec);
}
// Fills event.start_time_string in XMLTV format with passed DVB time

View File

@@ -179,16 +179,21 @@ void mprint(const char *fmt, ...)
if (!ccx_options.messages_target)
return;
va_start(args, fmt);
if (ccx_options.messages_target == CCX_MESSAGES_STDOUT)
FILE *target = (ccx_options.messages_target == CCX_MESSAGES_STDOUT) ? stdout : stderr;
if (fmt[0] == '\r')
{
vfprintf(stdout, fmt, args);
fflush(stdout);
}
else
{
vfprintf(stderr, fmt, args);
fflush(stderr);
#ifndef _WIN32
fprintf(target, "\r\033[K"); // Clear the line first
fmt++; // Skip the '\r' so only the clean text gets printed next
#endif
}
// Windows (legacy console) does not support ANSI sequences; fallback to standard \r; and vfprintf below handles it the old-fashioned way.
vfprintf(target, fmt, args);
fflush(target);
va_end(args);
}

View File

@@ -0,0 +1,517 @@
/**
* VOBSUB decoder with OCR support
*
* Decodes VOBSUB (DVD bitmap) subtitles from MKV, MP4, or standalone idx/sub files
* and optionally performs OCR to convert to text.
*
* SPU (SubPicture Unit) format:
* - 2 bytes: total SPU size
* - 2 bytes: offset to control sequence
* - RLE-encoded pixel data (interlaced)
* - Control sequence with timing, colors, coordinates
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include "lib_ccx.h"
#include "vobsub_decoder.h"
#include "ccx_common_common.h"
#include "ccx_decoders_structs.h"
#include "ccx_common_constants.h"
#ifdef ENABLE_OCR
#include "ocr.h"
#endif
/*
 * Pack four channel values into one 32-bit word:
 * alpha in bits 24-31, red in bits 16-23, green in bits 8-15, blue in bits 0-7.
 * Only the alpha argument is cast to unsigned before shifting.
 */
#define RGBA(r, g, b, a) (((unsigned)(a) << 24) | ((r) << 16) | ((g) << 8) | (b))
/*
 * Control sequence structure.
 *
 * Holds the display parameters extracted from the control sequence of one
 * SPU by vobsub_decode_control().
 */
struct vobsub_ctrl_seq
{
	uint8_t color[4];      /* Color indices (one per 2-bit pixel value) */
	uint8_t alpha[4];      /* Alpha values (4-bit nibbles, 0 = transparent) */
	uint16_t coord[4];     /* x1, x2, y1, y2 */
	uint16_t pixoffset[2]; /* Offset to 1st and 2nd graphic line */
	/*
	 * Display start/stop delays, computed as (date << 10) / 90 from the
	 * 16-bit date field. That expression reaches 745642 for date = 0xFFFF,
	 * so a 16-bit field would silently wrap for any delay above ~65.5 s;
	 * 32 bits hold the full range.
	 */
	uint32_t start_time;
	uint32_t stop_time;
};
/* Decoder state for one VOBSUB stream, created by init_vobsub_decoder() */
struct vobsub_ctx
{
uint32_t palette[16]; /* 16-entry palette from idx header; red/green/blue are read from bits 16-23 / 8-15 / 0-7 */
int palette_parsed; /* 1 if palette has been parsed; otherwise a grayscale fallback is generated */
struct vobsub_ctrl_seq ctrl; /* Parameters of the most recently decoded control sequence */
unsigned char *bitmap; /* Decoded bitmap, one byte per pixel holding a 2-bit color index; allocated by vobsub_get_bitmap() */
#ifdef ENABLE_OCR
void *ocr_ctx; /* OCR context returned by init_ocr(); may be NULL if initialization failed */
#endif
};
/*
 * Pull the next 4-bit nibble out of the RLE stream.
 *
 * *nextbyte is a sliding 8-bit window: its high nibble is the value handed
 * back, and the window is then shifted left by four bits and refilled with
 * the following nibble of the stream.
 *
 * buffer   - raw stream bytes (no bounds checking is done here)
 * nextbyte - in/out sliding window
 * pos      - in/out index of the byte the refill nibble is taken from;
 *            advanced by one when *m is 0
 * m        - in/out nibble selector: 0 takes the high nibble of the next
 *            byte, non-zero takes the low nibble of the current one
 *
 * Returns the nibble (0..15).
 */
static int vobsub_get_bits(unsigned char *buffer, uint8_t *nextbyte, int *pos, int *m)
{
	const int nibble = (*nextbyte >> 4) & 0x0f;
	uint8_t refill;

	if (*m != 0)
	{
		/* Second half of the byte we are already on */
		refill = buffer[*pos] & 0x0f;
	}
	else
	{
		/* Move on to the next byte and take its first half */
		*pos += 1;
		refill = (buffer[*pos] >> 4) & 0x0f;
	}

	*nextbyte = (uint8_t)((*nextbyte << 4) | refill);
	*m = (*m + 1) % 2;
	return nibble;
}
/*
 * Decode one variable-length RLE code.
 *
 * A code is made of one to four nibbles. Nibbles keep being appended while
 * the accumulated value is below the threshold for its current width
 * (4, 16, 64). The two lowest bits of the finished code are the color
 * index, the remaining bits are the run length.
 *
 * color receives the 2-bit color index; nextbyte, pos and m are the stream
 * state shared with vobsub_get_bits().
 *
 * Returns the run length (may be 0).
 */
static int vobsub_rle_decode(unsigned char *buffer, int *color, uint8_t *nextbyte, int *pos, int *m)
{
	uint16_t code = vobsub_get_bits(buffer, nextbyte, pos, m);
	int threshold;

	for (threshold = 4; code < threshold && threshold <= 0x40; threshold <<= 2)
		code = (code << 4) | vobsub_get_bits(buffer, nextbyte, pos, m);

	*color = code & 0x3;
	return code >> 2;
}
/*
 * Decode one interlaced field of RLE pixel data.
 *
 * buffer/buf_size - SPU data and its size in bytes
 * pos             - offset of the field's first byte (caller guarantees
 *                   pos < buf_size)
 * dst             - first output row of this field; successive rows of the
 *                   field are 2 * w bytes apart
 * w               - bitmap width in pixels
 * rows            - number of rows belonging to this field
 *
 * Returns the buffer position reached when decoding stopped.
 */
static int vobsub_decode_field(unsigned char *buffer, size_t buf_size, int pos,
			       unsigned char *dst, int w, int rows)
{
	int x = 0;
	int row = 0;
	int m = 0;
	int color, len;
	uint8_t nextbyte = buffer[pos];

	/*
	 * A single RLE code consumes up to four nibbles and the nibble reader
	 * always looks one nibble ahead, so one decode step can touch
	 * buffer[pos + 2]. Requiring pos + 2 < buf_size keeps every read inside
	 * the buffer; rows that cannot be decoded safely stay zero-filled.
	 * NOTE(review): this assumes pixel data is followed by at least two more
	 * bytes (e.g. the control sequence) in well-formed input - confirm
	 * against the caller.
	 */
	while (row < rows && pos + 2 < (int)buf_size)
	{
		len = vobsub_rle_decode(buffer, &color, &nextbyte, &pos, &m);
		/* A zero length means "fill to end of row"; also clamp overlong runs */
		if (len > (w - x) || len == 0)
			len = w - x;
		/* Index rows explicitly so no pointer is ever formed past the bitmap */
		memset(dst + (size_t)row * 2 * w + x, color, len);
		x += len;
		if (x >= w)
		{
			x = 0;
			++row;
			/* Rows start byte-aligned: drop the pending half byte */
			if (m == 1)
				vobsub_get_bits(buffer, &nextbyte, &pos, &m);
		}
	}
	return pos;
}

/*
 * Decode bitmap from RLE-encoded SPU data.
 *
 * Uses ctx->ctrl.coord for the bitmap size and ctx->ctrl.pixoffset for the
 * start of the two interlaced fields. On success ctx->bitmap points to a
 * newly allocated w * h buffer with one color index (0..3) per pixel.
 *
 * ctx->bitmap is left untouched when the dimensions or the first field
 * offset are invalid or the allocation fails. If the second field cannot
 * be decoded (overlapping fields or offset out of bounds) the bitmap is
 * kept with only the first field filled in.
 */
static void vobsub_get_bitmap(struct vobsub_ctx *ctx, unsigned char *buffer, size_t buf_size)
{
	int w, h, pos;

	w = (ctx->ctrl.coord[1] - ctx->ctrl.coord[0]) + 1;
	h = (ctx->ctrl.coord[3] - ctx->ctrl.coord[2]) + 1;

	if (w <= 0 || h <= 0 || w > 4096 || h > 4096)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Invalid dimensions w=%d h=%d\n", w, h);
		return;
	}

	pos = ctx->ctrl.pixoffset[0];
	if (pos >= (int)buf_size)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Pixel offset out of bounds\n");
		return;
	}

	/* Zero-filled, so pixels that are never decoded use color index 0 */
	ctx->bitmap = calloc((size_t)w * h, 1);
	if (!ctx->bitmap)
		return;

	/* First field: rows 0, 2, 4, ... */
	pos = vobsub_decode_field(buffer, buf_size, pos, ctx->bitmap, w, (h + 1) / 2);

	/* The first field must not have run into the second one */
	if (pos > ctx->ctrl.pixoffset[1])
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Error creating bitmap - overlapping fields\n");
		return;
	}

	pos = ctx->ctrl.pixoffset[1];
	if (pos >= (int)buf_size)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Second field offset out of bounds\n");
		return;
	}

	/* Second field: rows 1, 3, 5, ... */
	vobsub_decode_field(buffer, buf_size, pos, ctx->bitmap + w, w, h / 2);
}
/*
 * Parse control sequence from SPU data.
 *
 * ctx         - decoder context; ctx->ctrl is cleared and then filled in
 * buffer      - SPU data
 * buf_size    - size of buffer in bytes
 * ctrl_offset - offset of the first control sequence inside buffer
 *
 * Each control sequence starts with a 16-bit date and a 16-bit offset of
 * the next sequence, followed by commands up to an 0xff terminator. A
 * sequence whose "next" offset points at itself is the last one.
 *
 * Parsing stops as soon as a command's parameters would extend past the
 * end of the buffer, so the leftover bytes of a truncated command are never
 * misread as further commands.
 */
static void vobsub_decode_control(struct vobsub_ctx *ctx, unsigned char *buffer, size_t buf_size, uint16_t ctrl_offset)
{
	int pos = ctrl_offset;
	int pack_end = 0;
	int truncated = 0; /* set when a command's parameters do not fit */
	uint16_t date, next_ctrl;

	memset(&ctx->ctrl, 0, sizeof(ctx->ctrl));

	while (!truncated && pack_end == 0 && pos + 4 <= (int)buf_size)
	{
		int seq_end = 0;

		date = (buffer[pos] << 8) | buffer[pos + 1];
		next_ctrl = (buffer[pos + 2] << 8) | buffer[pos + 3];
		/* The last sequence links to itself */
		if (next_ctrl == pos)
			pack_end = 1;
		pos += 4;

		while (seq_end == 0 && !truncated && pos < (int)buf_size)
		{
			int command = buffer[pos++];

			switch (command)
			{
				case 0x01: /* Start display */
					ctx->ctrl.start_time = (date << 10) / 90;
					break;
				case 0x02: /* Stop display */
					ctx->ctrl.stop_time = (date << 10) / 90;
					break;
				case 0x03: /* SET_COLOR */
					if (pos + 2 > (int)buf_size)
					{
						truncated = 1;
						break;
					}
					ctx->ctrl.color[3] = (buffer[pos] & 0xf0) >> 4;
					ctx->ctrl.color[2] = buffer[pos] & 0x0f;
					ctx->ctrl.color[1] = (buffer[pos + 1] & 0xf0) >> 4;
					ctx->ctrl.color[0] = buffer[pos + 1] & 0x0f;
					pos += 2;
					break;
				case 0x04: /* SET_CONTR (alpha) */
					if (pos + 2 > (int)buf_size)
					{
						truncated = 1;
						break;
					}
					ctx->ctrl.alpha[3] = (buffer[pos] & 0xf0) >> 4;
					ctx->ctrl.alpha[2] = buffer[pos] & 0x0f;
					ctx->ctrl.alpha[1] = (buffer[pos + 1] & 0xf0) >> 4;
					ctx->ctrl.alpha[0] = buffer[pos + 1] & 0x0f;
					pos += 2;
					break;
				case 0x05: /* SET_DAREA (coordinates) */
					if (pos + 6 > (int)buf_size)
					{
						truncated = 1;
						break;
					}
					/* Four 12-bit values packed into six bytes: x1, x2, y1, y2 */
					ctx->ctrl.coord[0] = ((buffer[pos] << 8) | (buffer[pos + 1] & 0xf0)) >> 4;
					ctx->ctrl.coord[1] = ((buffer[pos + 1] & 0x0f) << 8) | buffer[pos + 2];
					ctx->ctrl.coord[2] = ((buffer[pos + 3] << 8) | (buffer[pos + 4] & 0xf0)) >> 4;
					ctx->ctrl.coord[3] = ((buffer[pos + 4] & 0x0f) << 8) | buffer[pos + 5];
					pos += 6;
					break;
				case 0x06: /* SET_DSPXA (pixel offset) */
					if (pos + 4 > (int)buf_size)
					{
						truncated = 1;
						break;
					}
					ctx->ctrl.pixoffset[0] = (buffer[pos] << 8) | buffer[pos + 1];
					ctx->ctrl.pixoffset[1] = (buffer[pos + 2] << 8) | buffer[pos + 3];
					pos += 4;
					break;
				case 0x07: /* Extended command */
					if (pos + 2 > (int)buf_size)
					{
						truncated = 1;
						break;
					}
					/* The 16-bit size field gives the distance to skip from its own position */
					pos += (uint16_t)((buffer[pos] << 8) | buffer[pos + 1]);
					break;
				case 0xff: /* End of control sequence */
					seq_end = 1;
					break;
				default:
					dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Unknown control command 0x%02x\n", command);
					break;
			}
		}
	}
}
/*
 * Build the four-entry RGBA palette for the current subtitle.
 *
 * For each of the four pixel values:
 *  - alpha 0 yields a fully transparent entry (0);
 *  - otherwise, when an idx palette was parsed, the entry takes its RGB from
 *    ctx->palette[color index] and its alpha from the 4-bit alpha scaled to
 *    0-255;
 *  - otherwise a gray level is guessed from the number of non-transparent
 *    entries.
 *
 * rgba_palette must have room for four values.
 */
static void vobsub_generate_rgba_palette(struct vobsub_ctx *ctx, uint32_t *rgba_palette)
{
	/* Gray ramps used for the fallback, indexed by (opaque entries - 1) */
	static const uint8_t gray_ramp[4][4] = {
	    {0xff},
	    {0x00, 0xff},
	    {0x00, 0x80, 0xff},
	    {0x00, 0x55, 0xaa, 0xff},
	};
	int opaque = 0;
	int idx;

	/* The number of non-transparent entries is the same for every index */
	for (idx = 0; idx < 4; idx++)
	{
		if (ctx->ctrl.alpha[idx] != 0)
			opaque++;
	}
	if (opaque == 0)
		opaque = 1;
	if (opaque > 4)
		opaque = 4;

	for (idx = 0; idx < 4; idx++)
	{
		const uint8_t alpha4 = ctx->ctrl.alpha[idx];
		uint8_t a;

		if (alpha4 == 0)
		{
			rgba_palette[idx] = 0; /* Fully transparent */
			continue;
		}

		a = alpha4 * 17; /* Scale 0-15 to 0-255 */

		if (ctx->palette_parsed)
		{
			/* Use parsed palette from idx header */
			const uint32_t rgb = ctx->palette[ctx->ctrl.color[idx] & 0x0f];
			const uint8_t r = (rgb >> 16) & 0xff;
			const uint8_t g = (rgb >> 8) & 0xff;
			const uint8_t b = rgb & 0xff;

			rgba_palette[idx] = RGBA(r, g, b, a);
		}
		else
		{
			/* Fallback: guess palette (grayscale levels) */
			const int column = idx < opaque ? idx : opaque - 1;
			const int level = gray_ramp[opaque - 1][column];

			rgba_palette[idx] = RGBA(level, level, level, a);
		}
	}
}
/*
 * Allocate a zero-initialized VOBSUB decoder context.
 *
 * When built with ENABLE_OCR an OCR context is created as well; if that
 * fails a warning is printed and the decoder is still returned.
 *
 * Returns the new context, or NULL if the allocation fails.
 */
struct vobsub_ctx *init_vobsub_decoder(void)
{
	struct vobsub_ctx *decoder = calloc(1, sizeof(struct vobsub_ctx));
	if (decoder == NULL)
		return NULL;

#ifdef ENABLE_OCR
	/* 1 = default language index (English) */
	decoder->ocr_ctx = init_ocr(1);
	if (decoder->ocr_ctx == NULL)
	{
		/* Not fatal: the decoder keeps working, only OCR is unavailable */
		mprint("VOBSUB: Warning - OCR initialization failed\n");
	}
#endif

	return decoder;
}
/*
 * Parse the 16-entry color palette from a VobSub idx header.
 *
 * Looks for the first "palette:" entry in idx_header and reads 16
 * comma-separated hexadecimal RGB values from that line. The context is
 * only modified when all 16 values were read: ctx->palette is then
 * replaced and ctx->palette_parsed is set.
 *
 * Returns 0 on success, -1 if an argument is NULL, no "palette:" entry
 * exists, or fewer than 16 colors could be parsed from the line.
 */
int vobsub_parse_palette(struct vobsub_ctx *ctx, const char *idx_header)
{
	if (!ctx || !idx_header)
		return -1;

	/* Find "palette:" line */
	const char *cursor = strstr(idx_header, "palette:");
	if (!cursor)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: No palette line found in idx header\n");
		return -1;
	}
	cursor += 8; /* Skip "palette:" */

	/* Parse into scratch storage so a malformed line cannot leave a half-written palette */
	unsigned int parsed[16];
	for (int i = 0; i < 16; i++)
	{
		/* Skip blanks before the value */
		while (*cursor == ' ' || *cursor == '\t')
			cursor++;

		/*
		 * sscanf's %x skips leading whitespace including newlines, so an
		 * explicit end-of-line check is needed to avoid reading hex digits
		 * from whatever follows the palette line.
		 */
		if (*cursor == '\0' || *cursor == '\n' || *cursor == '\r' ||
		    sscanf(cursor, "%x", &parsed[i]) != 1)
		{
			dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Failed to parse palette color %d\n", i);
			return -1;
		}

		/* Advance past this value and its separating comma */
		while (*cursor && *cursor != ',' && *cursor != '\n')
			cursor++;
		if (*cursor == ',')
			cursor++;
	}

	for (int i = 0; i < 16; i++)
		ctx->palette[i] = parsed[i];
	ctx->palette_parsed = 1;
	dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Parsed palette from idx header\n");
	return 0;
}
/*
 * Decode one SPU packet into a bitmap subtitle.
 *
 * The packet header is validated, the control sequence and the bitmap are
 * decoded into ctx, and the result is copied into a freshly allocated
 * cc_bitmap (indexed pixels in data0, a 4-color RGBA palette in data1).
 * When built with ENABLE_OCR and an OCR context exists, OCR is run on the
 * rectangle and its text is attached.
 *
 * ctx        decoder context
 * spu_data   raw SPU packet, starting with the 2-byte size field
 * spu_size   number of bytes available in spu_data (at least 4)
 * start_time subtitle start time
 * end_time   subtitle end time; when not positive, start_time plus the
 *            stop time taken from the control sequence is used instead
 * sub        output subtitle; only written when decoding succeeds
 *
 * Returns 0 on success, -1 on invalid arguments, an invalid header, a
 * bitmap decoding failure, or an allocation failure.
 */
int vobsub_decode_spu(struct vobsub_ctx *ctx,
		      unsigned char *spu_data, size_t spu_size,
		      long long start_time, long long end_time,
		      struct cc_subtitle *sub)
{
	struct cc_bitmap *rect = NULL;
	size_t pixels = 0;

	if (!ctx || !spu_data || spu_size < 4 || !sub)
		return -1;

	/* Parse SPU header */
	uint16_t size_spu = (spu_data[0] << 8) | spu_data[1];
	uint16_t ctrl_offset = (spu_data[2] << 8) | spu_data[3];
	if (ctrl_offset > spu_size || size_spu > spu_size)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Invalid SPU header (size=%u, ctrl=%u, buf=%zu)\n",
			  size_spu, ctrl_offset, spu_size);
		return -1;
	}

	/* Parse control sequence */
	vobsub_decode_control(ctx, spu_data, spu_size, ctrl_offset);

	/* Free any previous bitmap (free(NULL) is a no-op) */
	free(ctx->bitmap);
	ctx->bitmap = NULL;

	/* Decode bitmap */
	vobsub_get_bitmap(ctx, spu_data, spu_size);
	if (!ctx->bitmap)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Failed to decode bitmap\n");
		return -1;
	}

	int w = (ctx->ctrl.coord[1] - ctx->ctrl.coord[0]) + 1;
	int h = (ctx->ctrl.coord[3] - ctx->ctrl.coord[2]) + 1;
	if (w <= 0 || h <= 0)
	{
		dbg_print(CCX_DMT_VERBOSE, "VOBSUB: Invalid bitmap dimensions\n");
		goto fail;
	}
	pixels = (size_t)w * (size_t)h;

	/*
	 * Build the rectangle completely before touching `sub`, so that a
	 * failure never leaves the caller with a subtitle that claims to have
	 * data but carries none.
	 */
	rect = calloc(1, sizeof(struct cc_bitmap));
	if (!rect)
		goto fail;

	/* Copy bitmap data */
	rect->data0 = malloc(pixels);
	if (!rect->data0)
		goto fail;
	memcpy(rect->data0, ctx->bitmap, pixels);

	/* Generate RGBA palette */
	rect->data1 = calloc(1, 1024); /* Space for 256 colors */
	if (!rect->data1)
		goto fail;
	vobsub_generate_rgba_palette(ctx, (uint32_t *)rect->data1);

	rect->nb_colors = 4;
	rect->x = ctx->ctrl.coord[0];
	rect->y = ctx->ctrl.coord[2];
	rect->w = w;
	rect->h = h;
	rect->linesize0 = w;

#ifdef ENABLE_OCR
	/* Run OCR if available */
	if (ctx->ocr_ctx)
	{
		char *ocr_str = NULL;
		int ret = ocr_rect(ctx->ocr_ctx, rect, &ocr_str, 0, 1); /* quantmode=1 */
		if (ret >= 0 && ocr_str)
		{
			rect->ocr_text = ocr_str;
		}
	}
#endif

	/* Everything succeeded: publish the result */
	sub->type = CC_BITMAP;
	sub->nb_data = 1;
	sub->got_output = 1;
	sub->data = rect;
	sub->datatype = CC_DATATYPE_GENERIC;
	sub->start_time = start_time;
	sub->end_time = end_time > 0 ? end_time : start_time + ctx->ctrl.stop_time;

	free(ctx->bitmap);
	ctx->bitmap = NULL;
	return 0;

fail:
	if (rect)
	{
		/* data1 is the last allocation, so it is never set on this path */
		free(rect->data0);
		free(rect);
	}
	free(ctx->bitmap);
	ctx->bitmap = NULL;
	return -1;
}
/*
 * Report whether this build includes OCR support for VOBSUB.
 * Returns 1 when compiled with ENABLE_OCR, 0 otherwise.
 */
int vobsub_ocr_available(void)
{
	int available = 0;
#ifdef ENABLE_OCR
	available = 1;
#endif
	return available;
}
/*
 * Destroy a decoder created by init_vobsub_decoder() and reset the
 * caller's pointer to NULL. Does nothing when ctx or *ctx is NULL.
 */
void delete_vobsub_decoder(struct vobsub_ctx **ctx)
{
	struct vobsub_ctx *decoder = ctx ? *ctx : NULL;
	if (decoder == NULL)
		return;

#ifdef ENABLE_OCR
	if (decoder->ocr_ctx != NULL)
		delete_ocr(&decoder->ocr_ctx);
#endif

	/* free(NULL) is a no-op, so no separate bitmap check is required */
	free(decoder->bitmap);
	free(decoder);
	*ctx = NULL;
}

View File

@@ -0,0 +1,53 @@
#ifndef VOBSUB_DECODER_H
#define VOBSUB_DECODER_H
#include "ccx_decoders_structs.h"
/**
* VOBSUB decoder context - opaque structure.
* Create it with init_vobsub_decoder() and release it with
* delete_vobsub_decoder().
*/
struct vobsub_ctx;
/**
* Initialize VOBSUB decoder context.
* The context starts out zeroed. In builds with OCR support an OCR context
* is created too; failing to create it only produces a warning.
* @return Pointer to context, or NULL on failure
*/
struct vobsub_ctx *init_vobsub_decoder(void);
/**
* Parse palette from idx header string (e.g., from MKV CodecPrivate)
* Looks for "palette:" line and parses 16 hex RGB colors
* @param ctx VOBSUB decoder context
* @param idx_header The idx header string containing palette info
* @return 0 on success, -1 on failure (e.g. a NULL argument or no
*         "palette:" entry in the header)
*/
int vobsub_parse_palette(struct vobsub_ctx *ctx, const char *idx_header);
/**
* Decode single SPU packet and optionally perform OCR
* @param ctx VOBSUB decoder context
* @param spu_data Raw SPU data (starting with 2-byte size)
* @param spu_size Size of SPU data in bytes (must be at least 4)
* @param start_time Start time in milliseconds
* @param end_time End time in milliseconds; when not positive the end time
*        is derived from start_time and the stop time found in the packet
* @param sub Output subtitle structure; on success it describes a single
*        bitmap (type CC_BITMAP) whose data points to a newly allocated
*        cc_bitmap
* @return 0 on success, -1 on error (invalid arguments, invalid SPU header,
*         bitmap decoding failure or allocation failure)
*/
int vobsub_decode_spu(struct vobsub_ctx *ctx,
unsigned char *spu_data, size_t spu_size,
long long start_time, long long end_time,
struct cc_subtitle *sub);
/**
* Check if VOBSUB OCR is available (compiled with OCR support)
* @return 1 if OCR available, 0 otherwise
*/
int vobsub_ocr_available(void);
/**
* Free VOBSUB decoder context and resources.
* Does nothing when ctx or *ctx is NULL.
* @param ctx Pointer to context pointer (will be set to NULL)
*/
void delete_vobsub_decoder(struct vobsub_ctx **ctx);
#endif /* VOBSUB_DECODER_H */

44
src/rust/Cargo.lock generated
View File

@@ -161,6 +161,12 @@ version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
[[package]]
name = "by_address"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06"
[[package]]
name = "camino"
version = "1.2.1"
@@ -355,21 +361,18 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "fast-srgb8"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd2e7510819d6fbf51a5545c8f922716ecfb14df168a3242f7d33e0239efe6a1"
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "find-crate"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59a98bbaacea1c0eb6a0876280051b892eb73594fd90cf3b20e9c817029c57d2"
dependencies = [
"toml",
]
[[package]]
name = "form_urlencoded"
version = "1.2.2"
@@ -819,26 +822,26 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
[[package]]
name = "palette"
version = "0.6.1"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f9cd68f7112581033f157e56c77ac4a5538ec5836a2e39284e65bd7d7275e49"
checksum = "4cbf71184cc5ecc2e4e1baccdb21026c20e5fc3dcf63028a086131b3ab00b6e6"
dependencies = [
"approx",
"num-traits",
"fast-srgb8",
"palette_derive",
"phf",
]
[[package]]
name = "palette_derive"
version = "0.6.1"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05eedf46a8e7c27f74af0c9cfcdb004ceca158cb1b918c6f68f8d7a549b3e427"
checksum = "f5030daf005bface118c096f510ffb781fc28f9ab6a32ab224d8631be6851d30"
dependencies = [
"find-crate",
"by_address",
"proc-macro2",
"quote",
"syn 1.0.109",
"syn 2.0.111",
]
[[package]]
@@ -1436,15 +1439,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "toml"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234"
dependencies = [
"serde",
]
[[package]]
name = "toml_datetime"
version = "0.7.3"

View File

@@ -13,7 +13,7 @@ crate-type = ["staticlib"]
[dependencies]
log = "0.4.26"
env_logger = "0.8.4"
palette = "0.6.1"
palette = "0.7"
tesseract-sys = { version = "0.5.15", optional = true, default-features = false }
leptonica-sys = { version = "= 0.4.6", optional = true, default-features = false }
clap = { version = "4.5.31", features = ["derive"] }

View File

@@ -84,7 +84,12 @@ fn main() {
{
builder = builder.clang_arg("-DENABLE_HARDSUBX");
// Add FFmpeg include paths for Mac
// Check FFMPEG_INCLUDE_DIR environment variable (works on all platforms)
if let Ok(ffmpeg_include) = env::var("FFMPEG_INCLUDE_DIR") {
builder = builder.clang_arg(format!("-I{}", ffmpeg_include));
}
// Add FFmpeg include paths for Mac (Homebrew)
if cfg!(target_os = "macos") {
// Try common Homebrew paths
if std::path::Path::new("/opt/homebrew/include").exists() {
@@ -98,22 +103,23 @@ fn main() {
if std::path::Path::new(cellar_ffmpeg).exists() {
// Find the FFmpeg version directory
if let Ok(entries) = std::fs::read_dir(cellar_ffmpeg) {
for entry in entries {
if let Ok(entry) = entry {
let include_path = entry.path().join("include");
if include_path.exists() {
builder =
builder.clang_arg(format!("-I{}", include_path.display()));
break;
}
for entry in entries.flatten() {
let include_path = entry.path().join("include");
if include_path.exists() {
builder = builder.clang_arg(format!("-I{}", include_path.display()));
break;
}
}
}
}
}
// Also check environment variable
if let Ok(ffmpeg_include) = env::var("FFMPEG_INCLUDE_DIR") {
builder = builder.clang_arg(format!("-I{}", ffmpeg_include));
// On Linux, try pkg-config to find FFmpeg include paths
if cfg!(target_os = "linux") {
if let Ok(lib) = pkg_config::Config::new().probe("libavcodec") {
for path in lib.include_paths {
builder = builder.clang_arg(format!("-I{}", path.display()));
}
}
}
}

View File

@@ -147,7 +147,11 @@ pub const CCX_DECODER_608_SCREEN_WIDTH: usize = 32;
pub const ONEPASS: usize = 120; // Bytes we can always look ahead without going out of limits
pub const BUFSIZE: usize = 2048 * 1024 + ONEPASS; // 2 Mb plus the safety pass
pub const MAX_CLOSED_CAPTION_DATA_PER_PICTURE: usize = 32;
pub const EIA_708_BUFFER_LENGTH: usize = 2048; // TODO: Find out what the real limit is
/// CEA-708 Service Input Buffer size.
/// Specification minimum is 128 bytes per service, but we use 2048 bytes
/// (16x the minimum) to provide a safety margin for buffer management.
/// Reference: CEA-708-E Section 8.4.3 - Service Input Buffers
pub const EIA_708_BUFFER_LENGTH: usize = 2048;
pub const TS_PACKET_PAYLOAD_LENGTH: usize = 184; // From specs
pub const SUBLINESIZE: usize = 2048; // Max. length of a .srt line - TODO: Get rid of this
pub const STARTBYTESLENGTH: usize = 1024 * 1024;
@@ -278,6 +282,7 @@ pub enum StreamMode {
Gxf = 11,
Mkv = 12,
Mxf = 13,
Scc = 14, // Scenarist Closed Caption input
Autodetect = 16,
}
#[derive(Debug, Eq, Clone, Copy)]

View File

@@ -0,0 +1,385 @@
//! MKV language filtering support.
//!
//! Matroska files support two language code formats:
//! - ISO 639-2 (3-letter bibliographic codes): "eng", "fre", "chi"
//! - BCP 47 / IETF language tags: "en-US", "fr-CA", "zh-Hans"
//!
//! This module provides [`MkvLangFilter`] for parsing and matching language codes.
use std::fmt;
use std::str::FromStr;
/// A filter for matching MKV track languages.
///
/// Supports comma-separated lists of language codes in either:
/// - ISO 639-2 format (3-letter codes like "eng", "fre")
/// - BCP 47 format (tags like "en-US", "fr-CA", "zh-Hans")
///
/// A track is accepted when at least one of the listed codes matches it;
/// comparison is case-insensitive.
///
/// # Examples
///
/// ```
/// use lib_ccxr::common::MkvLangFilter;
///
/// // Single language
/// let filter: MkvLangFilter = "eng".parse().unwrap();
/// assert!(filter.matches("eng", None));
///
/// // Multiple languages
/// let filter: MkvLangFilter = "eng,fre,chi".parse().unwrap();
/// assert!(filter.matches("fre", None));
///
/// // BCP 47 matching
/// let filter: MkvLangFilter = "en-US,fr-CA".parse().unwrap();
/// assert!(filter.matches("eng", Some("en-US")));
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MkvLangFilter {
    /// The input string the filter was built from, with surrounding
    /// whitespace trimmed but otherwise unchanged (used for C FFI)
    raw: String,
    /// Parsed and validated language codes, one per comma-separated entry,
    /// each stored in lowercase
    codes: Vec<LanguageCode>,
}
/// A single language code, either ISO 639-2 or BCP 47.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LanguageCode {
    /// The code in normalized form: trimmed and lowercased
    code: String,
}

/// Error type for invalid language codes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvalidLanguageCode {
    /// The code that was rejected
    pub code: String,
    /// Why the code was rejected
    pub reason: &'static str,
}

impl fmt::Display for InvalidLanguageCode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { code, reason } = self;
        write!(f, "invalid language code '{code}': {reason}")
    }
}

impl std::error::Error for InvalidLanguageCode {}

impl LanguageCode {
    /// Validates `code` and builds a normalized language code from it.
    ///
    /// Surrounding whitespace is ignored and the stored code is lowercased.
    ///
    /// Accepted forms:
    /// - ISO 639-2 codes: 3 ASCII letters (e.g., "eng", "fre")
    /// - BCP 47 tags: a primary language followed by optional hyphen-separated
    ///   subtags (e.g., "en-US", "fr-CA", "zh-Hans-CN")
    ///
    /// # BCP 47 structure
    /// - Primary language: 2-3 letters
    /// - Script (optional): 4 letters (e.g., "Hans", "Latn")
    /// - Region (optional): 2 letters or 3 digits (e.g., "US", "419")
    /// - Variant (optional): 5-8 alphanumeric characters, or 4 starting with a digit
    /// - Singleton (optional): one alphanumeric character
    ///
    /// # Errors
    /// Returns [`InvalidLanguageCode`] naming the first rule the input violates.
    pub fn new(code: &str) -> Result<Self, InvalidLanguageCode> {
        let code = code.trim();
        let reject = |reason: &'static str| -> Result<Self, InvalidLanguageCode> {
            Err(InvalidLanguageCode {
                code: code.to_string(),
                reason,
            })
        };

        if code.is_empty() {
            return reject("empty language code");
        }
        // Only ASCII letters, digits and hyphens may appear anywhere in the tag
        if code
            .bytes()
            .any(|b| !(b.is_ascii_alphanumeric() || b == b'-'))
        {
            return reject("must contain only ASCII letters, digits, and hyphens");
        }
        if code.starts_with('-') || code.ends_with('-') {
            return reject("cannot start or end with hyphen");
        }
        if code.contains("--") {
            return reject("cannot have consecutive hyphens");
        }

        // The first subtag is the primary language; the rest are checked by shape
        let mut subtags = code.split('-');
        let primary = subtags.next().unwrap_or_default();
        if !(2..=3).contains(&primary.len()) {
            return reject("primary language subtag must be 2-3 letters");
        }
        if primary.bytes().any(|b| !b.is_ascii_alphabetic()) {
            return reject("primary language subtag must contain only letters");
        }

        for subtag in subtags {
            if subtag.is_empty() {
                return reject("empty subtag");
            }
            if !Self::subtag_is_well_formed(subtag) {
                return reject("invalid subtag format");
            }
        }

        Ok(Self {
            code: code.to_lowercase(),
        })
    }

    /// Decides whether a non-primary subtag has an acceptable shape.
    ///
    /// Shapes, by length:
    /// - 1: alphanumeric (extension or private-use singleton)
    /// - 2: letters (region)
    /// - 3: letters or digits
    /// - 4: letters (script), or alphanumeric starting with a digit (variant)
    /// - 5-8: alphanumeric (variant)
    fn subtag_is_well_formed(subtag: &str) -> bool {
        let bytes = subtag.as_bytes();
        let letters_only = bytes.iter().all(u8::is_ascii_alphabetic);
        let digits_only = bytes.iter().all(u8::is_ascii_digit);
        let alnum_only = bytes.iter().all(u8::is_ascii_alphanumeric);

        match bytes {
            [_] => alnum_only,
            [_, _] => letters_only,
            [_, _, _] => letters_only || digits_only,
            [first, _, _, _] => letters_only || (first.is_ascii_digit() && alnum_only),
            _ if (5..=8).contains(&bytes.len()) => alnum_only,
            _ => false,
        }
    }

    /// Returns the normalized (lowercase) code.
    pub fn as_str(&self) -> &str {
        self.code.as_str()
    }

    /// Checks if this code matches a track's language.
    ///
    /// Both track values are lowercased before comparison. The code matches when:
    /// 1. it equals the track's ISO 639-2 code, or
    /// 2. it equals the track's BCP 47 tag, or
    /// 3. one of the code and the BCP 47 tag is a prefix of the other and the
    ///    remainder starts with a hyphen (e.g., "en" and "en-US")
    pub fn matches(&self, iso639: &str, bcp47: Option<&str>) -> bool {
        let wanted = self.code.as_str();
        if iso639.to_lowercase() == wanted {
            return true;
        }

        let Some(tag) = bcp47.map(str::to_lowercase) else {
            return false;
        };

        // True when `longer` is `shorter` followed by a hyphen and further subtags
        let extends = |longer: &str, shorter: &str| {
            longer
                .strip_prefix(shorter)
                .is_some_and(|rest| rest.starts_with('-'))
        };

        tag == wanted || extends(&tag, wanted) || extends(wanted, &tag)
    }
}

impl FromStr for LanguageCode {
    type Err = InvalidLanguageCode;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        LanguageCode::new(s)
    }
}

impl fmt::Display for LanguageCode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}
impl MkvLangFilter {
    /// Builds a filter from a comma-separated list of language codes.
    ///
    /// Whitespace around the whole input is ignored. Each comma-separated
    /// entry is validated with [`LanguageCode::new`].
    ///
    /// # Errors
    /// Returns [`InvalidLanguageCode`] when the input is empty or when any
    /// entry is not a valid language code (the first failure is reported).
    pub fn new(input: &str) -> Result<Self, InvalidLanguageCode> {
        let trimmed = input.trim();
        if trimmed.is_empty() {
            return Err(InvalidLanguageCode {
                code: String::new(),
                reason: "empty language filter",
            });
        }

        let mut codes = Vec::new();
        for entry in trimmed.split(',') {
            codes.push(LanguageCode::new(entry)?);
        }

        Ok(Self {
            raw: trimmed.to_owned(),
            codes,
        })
    }

    /// Returns the raw input string (for C FFI compatibility).
    pub fn as_raw_str(&self) -> &str {
        self.raw.as_str()
    }

    /// Returns the parsed language codes.
    pub fn codes(&self) -> &[LanguageCode] {
        self.codes.as_slice()
    }

    /// Reports whether at least one code in the filter matches a track's language.
    ///
    /// # Arguments
    /// - `iso639`: The track's ISO 639-2 language code (e.g., "eng")
    /// - `bcp47`: The track's BCP 47 language tag, if available (e.g., "en-US")
    pub fn matches(&self, iso639: &str, bcp47: Option<&str>) -> bool {
        self.codes
            .iter()
            .any(|candidate| candidate.matches(iso639, bcp47))
    }
}
impl FromStr for MkvLangFilter {
type Err = InvalidLanguageCode;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Self::new(s)
}
}
/// Displays the filter as the raw string it was built from.
impl fmt::Display for MkvLangFilter {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.raw)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every tag is accepted by `LanguageCode::new`.
    fn assert_accepted(tags: &[&str]) {
        for tag in tags {
            assert!(LanguageCode::new(tag).is_ok(), "{tag:?} should be accepted");
        }
    }

    /// Asserts that every tag is rejected by `LanguageCode::new`.
    fn assert_rejected(tags: &[&str]) {
        for tag in tags {
            assert!(LanguageCode::new(tag).is_err(), "{tag:?} should be rejected");
        }
    }

    /// Asserts that the filter accepts every `(iso639, bcp47)` track listed.
    fn assert_matches(filter: &MkvLangFilter, tracks: &[(&str, Option<&str>)]) {
        for &(iso639, bcp47) in tracks {
            assert!(
                filter.matches(iso639, bcp47),
                "{filter} should match ({iso639:?}, {bcp47:?})"
            );
        }
    }

    /// Asserts that the filter refuses every `(iso639, bcp47)` track listed.
    fn assert_no_match(filter: &MkvLangFilter, tracks: &[(&str, Option<&str>)]) {
        for &(iso639, bcp47) in tracks {
            assert!(
                !filter.matches(iso639, bcp47),
                "{filter} should not match ({iso639:?}, {bcp47:?})"
            );
        }
    }

    #[test]
    fn test_iso639_codes() {
        // Three-letter codes, including an uppercase spelling
        assert_accepted(&["eng", "fre", "chi", "ENG"]);
        // Two-letter codes (ISO 639-1 style, valid in BCP 47)
        assert_accepted(&["en", "fr"]);
    }

    #[test]
    fn test_bcp47_codes() {
        // Language + region
        assert_accepted(&["en-US", "fr-CA", "pt-BR"]);
        // Language + script
        assert_accepted(&["zh-Hans", "zh-Hant", "sr-Latn"]);
        // Language + script + region
        assert_accepted(&["zh-Hans-CN", "zh-Hant-TW"]);
        // UN M.49 numeric region code (Latin America)
        assert_accepted(&["es-419"]);
    }

    #[test]
    fn test_invalid_codes() {
        // Too short
        assert_rejected(&["a"]);
        // Underscore, space and non-ASCII characters are not allowed
        assert_rejected(&["en_US", "en US", "ça"]);
        // Leading, trailing and doubled hyphens
        assert_rejected(&["-en", "en-", "en--US"]);
        // Empty
        assert_rejected(&[""]);
    }

    #[test]
    fn test_filter_multiple_codes() {
        let filter = MkvLangFilter::new("eng,fre,chi").unwrap();
        assert_eq!(filter.codes().len(), 3);
        assert_matches(&filter, &[("eng", None), ("fre", None), ("chi", None)]);
        assert_no_match(&filter, &[("spa", None)]);
    }

    #[test]
    fn test_filter_bcp47_matching() {
        let filter = MkvLangFilter::new("en-US,fr-CA").unwrap();
        // Exact BCP 47 match
        assert_matches(
            &filter,
            &[("eng", Some("en-US")), ("fre", Some("fr-CA"))],
        );
        // No match
        assert_no_match(&filter, &[("eng", Some("en-GB")), ("eng", None)]);
    }

    #[test]
    fn test_filter_mixed_formats() {
        let filter = MkvLangFilter::new("eng,fr-CA,zh-Hans").unwrap();
        assert_matches(
            &filter,
            &[
                ("eng", None),
                ("fre", Some("fr-CA")),
                ("chi", Some("zh-Hans")),
            ],
        );
    }

    #[test]
    fn test_case_insensitivity() {
        let filter = MkvLangFilter::new("ENG,FR-CA").unwrap();
        assert_matches(
            &filter,
            &[
                ("eng", None),
                ("ENG", None),
                ("fre", Some("fr-ca")),
                ("FRE", Some("FR-CA")),
            ],
        );
    }

    #[test]
    fn test_raw_string_preserved() {
        let filter = MkvLangFilter::new("eng,fre").unwrap();
        assert_eq!(filter.as_raw_str(), "eng,fre");
    }
}

View File

@@ -18,8 +18,10 @@
mod bitstream;
mod constants;
mod mkv_lang;
mod options;
pub use bitstream::*;
pub use constants::*;
pub use mkv_lang::*;
pub use options::*;

View File

@@ -462,8 +462,13 @@ pub struct Options {
/// (0 = no quantization at all, 1 = CCExtractor's internal,
/// 2 = reduce distinct color count in image for faster results.)
pub ocr_quantmode: u8,
/// The name of the language stream for MKV
pub mkvlang: Option<Language>,
/// If true, split images into lines before OCR (uses PSM 7 for better accuracy)
pub ocr_line_split: bool,
/// If true, use character blacklist to prevent common OCR errors (e.g. | vs I)
pub ocr_blacklist: bool,
/// Language filter for MKV subtitle tracks.
/// Accepts comma-separated ISO 639-2 codes (e.g., "eng,fre") or BCP 47 tags (e.g., "en-US,fr-CA").
pub mkvlang: Option<super::MkvLangFilter>,
/// If true, the video stream will be processed even if we're using a different one for subtitles.
pub analyze_video_stream: bool,
@@ -517,6 +522,10 @@ pub struct Options {
pub multiprogram: bool,
pub out_interval: i32,
pub segment_on_key_frames_only: bool,
/// SCC input framerate: 0=29.97 (default), 1=24, 2=25, 3=30
pub scc_framerate: i32,
/// SCC accurate timing (issue #1120): if true, use bandwidth-aware timing for broadcast compliance
pub scc_accurate_timing: bool,
pub debug_mask: DebugMessageMask,
#[cfg(feature = "with_libcurl")]
@@ -582,6 +591,8 @@ impl Default for Options {
ocr_oem: -1,
psm: 3,
ocr_quantmode: 0, // No quantization - better OCR accuracy for DVB subtitles
ocr_line_split: false, // Don't split images into lines by default
ocr_blacklist: true, // Use character blacklist by default to prevent | vs I errors
mkvlang: Default::default(),
analyze_video_stream: Default::default(),
hardsubx_ocr_mode: Default::default(),
@@ -618,6 +629,8 @@ impl Default for Options {
multiprogram: Default::default(),
out_interval: -1,
segment_on_key_frames_only: Default::default(),
scc_framerate: 0, // 0 = 29.97fps (default)
scc_accurate_timing: false, // Off by default for backwards compatibility (issue #1120)
debug_mask: DebugMessageMask::new(
DebugMessageFlag::GENERIC_NOTICE,
DebugMessageFlag::VERBOSE,

View File

@@ -82,7 +82,6 @@ impl<'a> SendTarget<'a> {
"Unable to connect, address passed is null\n"
);
}
info!("Target address: {}\n", config.target_addr); // TODO remove this
info!("Target port: {}\n", config.port.unwrap_or(DEFAULT_TCP_PORT));
let tcp_stream = TcpStream::connect((
config.target_addr,

View File

@@ -1154,10 +1154,9 @@ impl<'a> TeletextContext<'a> {
}
if v >= 0x20 {
let u = char::from_u32(v as u32).unwrap();
let u = char::from_u32(v as u32).unwrap_or(char::REPLACEMENT_CHARACTER);
self.page_buffer_cur.get_or_insert("".into()).push(u);
if logger().expect("could not access logger").is_gui_mode() {
// For now we just handle the easy stuff
eprint!("{u}");
}
}
@@ -1225,13 +1224,15 @@ impl<'a> TeletextContext<'a> {
}
}
_ => {
ans = Some(Subtitle::new_text(
self.page_buffer_cur.take().unwrap().into(),
self.page_buffer.show_timestamp,
self.page_buffer.hide_timestamp + Timestamp::from_millis(1),
None,
"TLT".into(),
));
if let Some(cur) = self.page_buffer_cur.take() {
ans = Some(Subtitle::new_text(
cur.into(),
self.page_buffer.show_timestamp,
self.page_buffer.hide_timestamp + Timestamp::from_millis(1),
None,
"TLT".into(),
));
}
}
}
@@ -1251,34 +1252,43 @@ impl<'a> TeletextContext<'a> {
capitalization_list: &[String],
) {
// variable names conform to ETS 300 706, chapter 7.1.2
let address = (decode_hamming_8_4(packet.address[1]).unwrap() << 4)
| decode_hamming_8_4(packet.address[0]).unwrap();
let Some(addr1) = decode_hamming_8_4(packet.address[1]) else {
return;
};
let Some(addr0) = decode_hamming_8_4(packet.address[0]) else {
return;
};
let address = (addr1 << 4) | addr0;
let mut m = address & 0x7;
if m == 0 {
m = 8;
}
let y = (address >> 3) & 0x1f;
let designation_code = if y > 25 {
decode_hamming_8_4(packet.data[0]).unwrap()
decode_hamming_8_4(packet.data[0]).unwrap_or(0x00)
} else {
0x00
};
if y == 0 {
// CC map
let i = (decode_hamming_8_4(packet.data[1]).unwrap() << 4)
| decode_hamming_8_4(packet.data[0]).unwrap();
let flag_subtitle = (decode_hamming_8_4(packet.data[5]).unwrap() & 0x08) >> 3;
let h1 = decode_hamming_8_4(packet.data[1]).unwrap_or(0);
let h0 = decode_hamming_8_4(packet.data[0]).unwrap_or(0);
let i = (h1 << 4) | h0;
let flag_subtitle = (decode_hamming_8_4(packet.data[5]).unwrap_or(0) & 0x08) >> 3;
self.cc_map[i as usize] |= flag_subtitle << (m - 1);
let flag_subtitle = flag_subtitle != 0;
if flag_subtitle && (i < 0xff) {
let mut thisp = ((m as u32) << 8)
| ((decode_hamming_8_4(packet.data[1]).unwrap() as u32) << 4)
| (decode_hamming_8_4(packet.data[0]).unwrap() as u32);
let t1 = format!("{thisp:x}"); // Example: 1928 -> 788
thisp = t1.parse().unwrap();
let h1 = decode_hamming_8_4(packet.data[1]).unwrap_or(0) as u32;
let h0 = decode_hamming_8_4(packet.data[0]).unwrap_or(0) as u32;
let mut thisp = ((m as u32) << 8) | (h1 << 4) | h0;
let t1 = format!("{thisp:x}");
// Fallback to original value if parsing fails to avoid panics on malformed BCD
thisp = t1.parse().unwrap_or(thisp);
if !self.seen_sub_page[thisp as usize] {
self.seen_sub_page[thisp as usize] = true;
info!(
@@ -1288,36 +1298,28 @@ impl<'a> TeletextContext<'a> {
}
}
if (self.config.page.get() == 0.into()) && flag_subtitle && (i < 0xff) {
self.config.page.replace(
(((m as u16) << 8)
| ((decode_hamming_8_4(packet.data[1]).unwrap() as u16) << 4)
| (decode_hamming_8_4(packet.data[0]).unwrap() as u16))
.into(),
);
let h1 = decode_hamming_8_4(packet.data[1]).unwrap_or(0) as u16;
let h0 = decode_hamming_8_4(packet.data[0]).unwrap_or(0) as u16;
self.config
.page
.replace((((m as u16) << 8) | (h1 << 4) | h0).into());
info!("- No teletext page specified, first received suitable page is {}, not guaranteed\n", self.config.page.get());
}
// Page number and control bits
let page_number: TeletextPageNumber = (((m as u16) << 8)
| ((decode_hamming_8_4(packet.data[1]).unwrap() as u16) << 4)
| (decode_hamming_8_4(packet.data[0]).unwrap() as u16))
.into();
let charset = ((decode_hamming_8_4(packet.data[7]).unwrap() & 0x08)
| (decode_hamming_8_4(packet.data[7]).unwrap() & 0x04)
| (decode_hamming_8_4(packet.data[7]).unwrap() & 0x02))
>> 1;
// let flag_suppress_header = decode_hamming_8_4(packet.data[6]).unwrap() & 0x01;
// let flag_inhibit_display = (decode_hamming_8_4(packet.data[6]).unwrap() & 0x08) >> 3;
let h1 = decode_hamming_8_4(packet.data[1]).unwrap_or(0) as u16;
let h0 = decode_hamming_8_4(packet.data[0]).unwrap_or(0) as u16;
let page_number: TeletextPageNumber = (((m as u16) << 8) | (h1 << 4) | h0).into();
let c7 = decode_hamming_8_4(packet.data[7]).unwrap_or(0);
let charset = (c7 & 0x08 | c7 & 0x04 | c7 & 0x02) >> 1;
// ETS 300 706, chapter 9.3.1.3:
// When set to '1' the service is designated to be in Serial mode and the transmission of a page is terminated
// by the next page header with a different page number.
// When set to '0' the service is designated to be in Parallel mode and the transmission of a page is terminated
// by the next page header with a different page number but the same magazine number.
// The same setting shall be used for all page headers in the service.
// ETS 300 706, chapter 7.2.1: Page is terminated by and excludes the next page header packet
// having the same magazine address in parallel transmission mode, or any magazine address in serial transmission mode.
self.transmission_mode = if decode_hamming_8_4(packet.data[7]).unwrap() & 0x01 == 0 {
self.transmission_mode = if c7 & 0x01 == 0 {
TransmissionMode::Parallel
} else {
TransmissionMode::Serial
@@ -1353,19 +1355,17 @@ impl<'a> TeletextContext<'a> {
// Now we have the beginning of page transmission; if there is page_buffer pending, process it
if self.page_buffer.tainted {
// Convert telx to UCS-2 before processing
for yt in 1..=23 {
for it in 0..40 {
if self.page_buffer.text[yt][it] != 0x00
&& !self.page_buffer.g2_char_present[yt][it]
{
self.page_buffer.text[yt][it] = self
.g0_charset
.ucs2_char(self.page_buffer.text[yt][it].try_into().unwrap());
if let Ok(c) = self.page_buffer.text[yt][it].try_into() {
self.page_buffer.text[yt][it] = self.g0_charset.ucs2_char(c);
}
}
}
}
// it would be nice, if subtitle hides on previous video frame, so we contract 40 ms (1 frame @25 fps)
self.page_buffer.hide_timestamp = timestamp - Timestamp::from_millis(40);
if self.page_buffer.hide_timestamp > timestamp {
self.page_buffer.hide_timestamp = Timestamp::from_millis(0);
@@ -1544,12 +1544,14 @@ impl<'a> TeletextContext<'a> {
info!("- Programme Identification Data = ");
for i in 20..40 {
let c = self.g0_charset.ucs2_char(packet.data[i]);
// strip any control codes from PID, eg. TVP station
if c < 0x20 {
continue;
}
info!("{}", char::from_u32(c as u32).unwrap());
info!(
"{}",
char::from_u32(c as u32).unwrap_or(char::REPLACEMENT_CHARACTER)
);
}
info!("\n");
@@ -1580,7 +1582,7 @@ impl<'a> TeletextContext<'a> {
info!(
"- Universal Time Co-ordinated = {}\n",
t0.to_ctime().unwrap()
t0.to_ctime().as_deref().unwrap_or("unknown")
);
debug!(msg_type = DebugMessageFlag::TELETEXT; "- Transmission mode = {:?}\n", self.transmission_mode);
@@ -1589,8 +1591,13 @@ impl<'a> TeletextContext<'a> {
&& matches!(self.config.date_format, TimestampFormat::Date { .. })
&& !self.config.noautotimeref
{
info!("- Broadcast Service Data Packet received, resetting UTC referential value to {}\n", t0.to_ctime().unwrap());
*UTC_REFVALUE.write().unwrap() = t as u64;
info!(
"- Broadcast Service Data Packet received, resetting UTC referential value to {}\n",
t0.to_ctime().as_deref().unwrap_or("unknown")
);
if let Ok(mut lock) = UTC_REFVALUE.write() {
*lock = t as u64;
}
self.states.pts_initialized = false;
}
@@ -1610,15 +1617,14 @@ impl<'a> TeletextContext<'a> {
if let Some(subtitles) = subtitles {
// output any pending close caption
if self.page_buffer.tainted {
// Convert telx to UCS-2 before processing
for yt in 1..=23 {
for it in 0..40 {
if self.page_buffer.text[yt][it] != 0x00
&& !self.page_buffer.g2_char_present[yt][it]
{
self.page_buffer.text[yt][it] = self
.g0_charset
.ucs2_char(self.page_buffer.text[yt][it].try_into().unwrap());
if let Ok(c) = self.page_buffer.text[yt][it].try_into() {
self.page_buffer.text[yt][it] = self.g0_charset.ucs2_char(c);
}
}
}
}

View File

@@ -225,9 +225,6 @@ impl Timestamp {
let m = millis / 60000 - 60 * h;
let s = millis / 1000 - 3600 * h - 60 * m;
let u = millis - 3600000 * h - 60000 * m - 1000 * s;
if h > 24 {
println!("{h}")
}
Ok((h.try_into()?, m as u8, s as u8, u as u16))
}

View File

@@ -269,6 +269,11 @@ impl<'a> CCExtractorLogger {
self.target
}
/// Sets the target for logging messages.
///
/// Overwrites the logger's stored `target` with the supplied [`OutputTarget`];
/// the previous value is dropped.
pub fn set_target(&mut self, target: OutputTarget) {
self.target = target;
}
/// Check if the messages are intercepted by GUI.
pub fn is_gui_mode(&self) -> bool {
self.gui_mode
@@ -276,8 +281,16 @@ impl<'a> CCExtractorLogger {
/// Writes the formatted `args` to whichever output the logger's `target` selects.
// NOTE(review): this span looks like a rendered diff hunk with removed and added
// lines interleaved. The two single-expression `Stdout`/`Stderr` arms are presumably
// the old version, and the block-bodied arms that flush after writing are their
// replacement. Confirm against the real file before relying on it.
fn print(&self, args: &Arguments<'a>) {
match &self.target {
// Single-expression arms: write to stdout / stderr with no explicit flush.
OutputTarget::Stdout => print!("{args}"),
OutputTarget::Stderr => eprint!("{args}"),
OutputTarget::Stdout => {
print!("{args}");
// Flush stdout to ensure output appears immediately, especially when
// mixing with C code that also writes to stdout
// The flush result is deliberately discarded (`let _ =`).
let _ = std::io::Write::flush(&mut std::io::stdout());
}
OutputTarget::Stderr => {
eprint!("{args}");
// Flush stderr as well; a flush failure is deliberately ignored.
let _ = std::io::Write::flush(&mut std::io::stderr());
}
// Quiet target: nothing is written.
OutputTarget::Quiet => {}
}
}

View File

@@ -28,7 +28,7 @@ const BURNEDIN_SUBTITLE_EXTRACTION: &str = "Burned-in subtitle extraction";
#[derive(Debug, Parser)]
#[command(name = "CCExtractor")]
#[command(author = "Carlos Fernandez Sanz, Volker Quetschke.")]
#[command(version = "1.0")]
#[command(version = "0.96.5")]
#[command(about = "Teletext portions taken from Petr Kutalek's telxcc
--------------------------------------------------------------------------
Originally based on McPoodle's tools. Check his page for lots of information
@@ -227,7 +227,7 @@ pub struct Args {
/// "all[EUC-KR]") and it will encode specified charset to
/// UTF-8 using iconv. See iconv documentation to check if
/// required encoding/charset is supported.
#[arg(long="service", value_name="services", verbatim_doc_comment, help_heading=OPTION_AFFECT_PROCESSED)]
#[arg(long="service", alias="svc", value_name="services", verbatim_doc_comment, help_heading=OPTION_AFFECT_PROCESSED)]
pub cea708services: Option<String>,
/// With the exception of McPoodle's raw format, which is just the closed
/// caption data with no other info, CCExtractor can usually detect the
@@ -290,6 +290,18 @@ pub struct Args {
/// DVD Recorder)
#[arg(long="90090", verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
pub mpeg90090: bool,
/// Set the frame rate for SCC (Scenarist Closed Caption) input files.
/// Valid values: 29.97 (default), 24, 25, 30
/// Example: --scc-framerate 25
#[arg(long="scc-framerate", verbatim_doc_comment, value_name="fps", help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
pub scc_framerate: Option<String>,
/// Enable bandwidth-aware timing for SCC output (issue #1120).
/// When enabled, captions are pre-loaded ahead of their display time
/// based on the EIA-608 transmission bandwidth (2 bytes/frame).
/// This ensures YouTube and broadcast compliance by preventing
/// caption collisions. Use this for professional SCC output.
#[arg(long="scc-accurate-timing", verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
pub scc_accurate_timing: bool,
/// By default, ccextractor will process input files in
/// sequence as if they were all one large file (i.e.
/// split by a generic, non video-aware tool. If you
@@ -390,10 +402,10 @@ pub struct Args {
/// reference to the received data. Use this parameter if
/// you prefer your own reference. Note: Current this only
/// affects Teletext in timed transcript with --datets.
#[arg(long, verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
#[arg(long, alias="noautotimeref", verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
pub no_autotimeref: bool,
/// Ignore SCTE-20 data if present.
#[arg(long, verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
#[arg(long, alias="noscte20", verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
pub no_scte20: bool,
/// Create a separate file for CSS instead of inline.
#[arg(long, verbatim_doc_comment, help_heading=OPTIONS_AFFECTING_INPUT_FILES)]
@@ -448,7 +460,7 @@ pub struct Args {
/// Do not append a BOM (Byte Order Mark) to output
/// files. Note that this may break files when using
/// Windows. This is the default in non-Windows builds.
#[arg(long, verbatim_doc_comment, conflicts_with="bom", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
#[arg(long, alias="nobom", verbatim_doc_comment, conflicts_with="bom", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub no_bom: bool,
/// Encode subtitles in Unicode instead of Latin-1.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
@@ -481,7 +493,7 @@ pub struct Args {
pub defaultcolor: Option<String>,
/// Sentence capitalization. Use if you hate
/// ALL CAPS in subtitles.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
#[arg(long, alias="sc", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub sentencecap: bool,
/// Add the contents of 'file' to the list of words
/// that must be capitalized. For example, if file
@@ -625,6 +637,18 @@ pub struct Args {
/// bypassing hacks that are Tesseract-specific.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub psm: Option<u8>,
/// Split subtitle images into lines before OCR.
/// Uses PSM 7 (single text line mode) for each line,
/// which can improve accuracy for multi-line bitmap subtitles
/// (VOBSUB, DVD, DVB).
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub ocr_line_split: bool,
/// Disable the OCR character blacklist.
/// By default, CCExtractor blacklists characters like |, \, `, _
/// that are commonly misrecognized (e.g. 'I' as '|').
/// Use this flag to disable the blacklist.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub no_ocr_blacklist: bool,
/// For MKV subtitles, select which language's caption
/// stream will be processed. e.g. 'eng' for English.
/// Language codes can be either the 3 letters bibliographic
@@ -677,7 +701,7 @@ pub struct Args {
/// If you hate the repeated lines caused by the roll-up
/// emulation, you can have ccextractor write only one
/// line at a time, getting rid of these repeated lines.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_BUFFERING)]
#[arg(long, alias="noru", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_BUFFERING)]
pub no_rollup: bool,
/// roll-up captions can consist of 2, 3 or 4 visible
/// lines at any time (the number of lines is part of
@@ -806,10 +830,10 @@ pub struct Args {
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
pub parsedebug: bool,
/// Print Program Association Table dump.
#[arg(long="parsePAT", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
#[arg(long="parsePAT", alias="pat", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
pub parse_pat: bool,
/// Print Program Map Table dump.
#[arg(long="parsePMT", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
#[arg(long="parsePMT", alias="pmt", verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
pub parse_pmt: bool,
/// Hex-dump defective TS packets.
#[arg(long, verbatim_doc_comment, help_heading=OUTPUT_AFFECTING_DEBUG_DATA)]
@@ -844,7 +868,7 @@ pub struct Args {
/// for video streams that have both teletext packets
/// and CEA-608/708 packets (if teletext is processed
/// then CEA-608/708 processing is disabled).
#[arg(long, verbatim_doc_comment, conflicts_with="teletext", help_heading=TELETEXT_OPTIONS)]
#[arg(long, alias="noteletext", verbatim_doc_comment, conflicts_with="teletext", help_heading=TELETEXT_OPTIONS)]
pub no_teletext: bool,
/// Use the passed format to customize the (Timed) Transcript
/// output. The format must be like this: 1100100 (7 digits).
@@ -973,6 +997,8 @@ pub enum InFormat {
Mkv,
/// Material Exchange Format (MXF).
Mxf,
/// Scenarist Closed Caption (SCC).
Scc,
#[cfg(feature = "wtv_debug")]
// For WTV Debug mode only
Hex,

View File

@@ -8,7 +8,9 @@ use crate::{anchor_hdcc, current_fps, process_hdcc, store_hdcc, MPEG_CLOCK_FREQ}
use lib_ccxr::common::AvcNalType;
use lib_ccxr::util::log::DebugMessageFlag;
use lib_ccxr::{debug, info};
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
use std::os::raw::c_void;
use std::slice;
@@ -453,7 +455,7 @@ pub fn hex_dump(data: &[u8]) {
// Print hex bytes
for byte in chunk {
print!("{:02X} ", byte);
print!("{byte:02X} ");
}
// Pad if less than 16 bytes

View File

@@ -21,6 +21,19 @@ pub unsafe extern "C" fn ccxr_process_avc(
return 0;
}
// In report-only mode (-out=report), enc_ctx is NULL because no encoder is created.
// Skip AVC processing in this case since we can't output captions without an encoder.
// Return the full buffer length to indicate we've "consumed" the data.
if enc_ctx.is_null() {
return avcbuflen;
}
// dec_ctx and sub should never be NULL in normal operation, but check defensively
if dec_ctx.is_null() || sub.is_null() {
info!("Warning: dec_ctx or sub is NULL in ccxr_process_avc");
return avcbuflen;
}
// Create a safe slice from the raw pointer
let avc_slice = std::slice::from_raw_parts_mut(avcbuf, avcbuflen);

View File

@@ -50,7 +50,7 @@ pub fn sei_message(ctx: &mut AvcContextRust, seibuf: &[u8]) -> usize {
return 0;
}
let mut payload_type = 0;
let mut payload_type: u32 = 0;
while seibuf_idx < seibuf.len() && seibuf[seibuf_idx] == 0xff {
payload_type += 255;
seibuf_idx += 1;
@@ -60,10 +60,10 @@ pub fn sei_message(ctx: &mut AvcContextRust, seibuf: &[u8]) -> usize {
return seibuf_idx;
}
payload_type += seibuf[seibuf_idx] as i32;
payload_type += seibuf[seibuf_idx] as u32;
seibuf_idx += 1;
let mut payload_size = 0;
let mut payload_size: u32 = 0;
while seibuf_idx < seibuf.len() && seibuf[seibuf_idx] == 0xff {
payload_size += 255;
seibuf_idx += 1;
@@ -73,7 +73,7 @@ pub fn sei_message(ctx: &mut AvcContextRust, seibuf: &[u8]) -> usize {
return seibuf_idx;
}
payload_size += seibuf[seibuf_idx] as i32;
payload_size += seibuf[seibuf_idx] as u32;
seibuf_idx += 1;
let mut broken = false;
@@ -226,12 +226,10 @@ pub fn user_data_registered_itu_t_t35(ctx: &mut AvcContextRust, userbuf: &[u8])
}
// Save the data and process once we know the sequence number
if ((ctx.cc_count as usize + local_cc_count) * 3) + 1 > ctx.cc_databufsize {
let required_size = ((ctx.cc_count as usize + local_cc_count) * 3) + 1;
if required_size > ctx.cc_data.len() {
let new_size = ((ctx.cc_count as usize + local_cc_count) * 6) + 1;
unsafe {
ctx.cc_data.set_len(new_size);
}
ctx.cc_data.reserve(new_size);
ctx.cc_data.resize(new_size, 0);
ctx.cc_databufsize = new_size;
}

View File

@@ -18,6 +18,7 @@ use lib_ccxr::common::DtvccServiceCharset;
use lib_ccxr::common::EncoderConfig;
use lib_ccxr::common::EncodersTranscriptFormat;
use lib_ccxr::common::Language;
use lib_ccxr::common::MkvLangFilter;
use lib_ccxr::common::Options;
use lib_ccxr::common::OutputFormat;
use lib_ccxr::common::SelectCodec;
@@ -181,9 +182,11 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
(*ccx_s_options).ocr_oem = options.ocr_oem as _;
(*ccx_s_options).psm = options.psm as _;
(*ccx_s_options).ocr_quantmode = options.ocr_quantmode as _;
if let Some(mkvlang) = options.mkvlang {
(*ccx_s_options).ocr_line_split = options.ocr_line_split as _;
(*ccx_s_options).ocr_blacklist = options.ocr_blacklist as _;
if let Some(ref mkvlang) = options.mkvlang {
(*ccx_s_options).mkvlang =
replace_rust_c_string((*ccx_s_options).mkvlang, mkvlang.to_ctype().as_str());
replace_rust_c_string((*ccx_s_options).mkvlang, mkvlang.as_raw_str());
}
(*ccx_s_options).analyze_video_stream = options.analyze_video_stream as _;
(*ccx_s_options).hardsubx_ocr_mode = options.hardsubx_ocr_mode.to_ctype();
@@ -209,11 +212,9 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
replace_rust_c_string((*ccx_s_options).udpaddr, &options.udpaddr.clone().unwrap());
}
(*ccx_s_options).udpport = options.udpport as _;
if options.tcpport.is_some() {
(*ccx_s_options).tcpport = replace_rust_c_string(
(*ccx_s_options).tcpport,
&options.tcpport.unwrap().to_string(),
);
if let Some(tcpport) = options.tcpport {
(*ccx_s_options).tcpport =
replace_rust_c_string((*ccx_s_options).tcpport, &tcpport.to_string());
}
if options.tcp_password.is_some() {
(*ccx_s_options).tcp_password = replace_rust_c_string(
@@ -233,11 +234,9 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
&options.srv_addr.clone().unwrap(),
);
}
if options.srv_port.is_some() {
(*ccx_s_options).srv_port = replace_rust_c_string(
(*ccx_s_options).srv_port,
&options.srv_port.unwrap().to_string(),
);
if let Some(srv_port) = options.srv_port {
(*ccx_s_options).srv_port =
replace_rust_c_string((*ccx_s_options).srv_port, &srv_port.to_string());
}
(*ccx_s_options).noautotimeref = options.noautotimeref as _;
(*ccx_s_options).input_source = options.input_source as _;
@@ -251,15 +250,12 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
// Subsequent calls from ccxr_demuxer_open/close should NOT modify inputfile because
// C code holds references to those strings throughout processing.
// Freeing them would cause use-after-free and double-free errors.
if options.inputfile.is_some() && (*ccx_s_options).inputfile.is_null() {
(*ccx_s_options).inputfile = string_to_c_chars(options.inputfile.clone().unwrap());
(*ccx_s_options).num_input_files = options
.inputfile
.as_ref()
.unwrap()
.iter()
.filter(|s| !s.is_empty())
.count() as _;
if let Some(ref inputfile) = options.inputfile {
if (*ccx_s_options).inputfile.is_null() {
(*ccx_s_options).inputfile = string_to_c_chars(inputfile.clone());
(*ccx_s_options).num_input_files =
inputfile.iter().filter(|s| !s.is_empty()).count() as _;
}
}
(*ccx_s_options).demux_cfg = options.demux_cfg.to_ctype();
// Only set enc_cfg on the first call (when output_filename is null).
@@ -275,6 +271,10 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
(*ccx_s_options).multiprogram = options.multiprogram as _;
(*ccx_s_options).out_interval = options.out_interval;
(*ccx_s_options).segment_on_key_frames_only = options.segment_on_key_frames_only as _;
(*ccx_s_options).scc_framerate = options.scc_framerate;
// Also copy to enc_cfg so the encoder uses the same frame rate for SCC output
(*ccx_s_options).enc_cfg.scc_framerate = options.scc_framerate;
(*ccx_s_options).enc_cfg.scc_accurate_timing = options.scc_accurate_timing.into();
#[cfg(feature = "with_libcurl")]
{
if options.curlposturl.is_some() {
@@ -416,13 +416,13 @@ pub unsafe fn copy_to_rust(ccx_s_options: *const ccx_s_options) -> Options {
options.ocr_oem = (*ccx_s_options).ocr_oem as i8;
options.psm = (*ccx_s_options).psm;
options.ocr_quantmode = (*ccx_s_options).ocr_quantmode as u8;
options.ocr_line_split = (*ccx_s_options).ocr_line_split != 0;
options.ocr_blacklist = (*ccx_s_options).ocr_blacklist != 0;
// Handle mkvlang (C string to Option<Language>)
// Handle mkvlang (C string to Option<MkvLangFilter>)
if !(*ccx_s_options).mkvlang.is_null() {
options.mkvlang = Some(
Language::from_str(&c_char_to_string((*ccx_s_options).mkvlang))
.expect("Invalid language"),
)
let lang_str = c_char_to_string((*ccx_s_options).mkvlang);
options.mkvlang = MkvLangFilter::new(&lang_str).ok();
}
options.analyze_video_stream = (*ccx_s_options).analyze_video_stream != 0;
@@ -531,6 +531,8 @@ pub unsafe fn copy_to_rust(ccx_s_options: *const ccx_s_options) -> Options {
options.multiprogram = (*ccx_s_options).multiprogram != 0;
options.out_interval = (*ccx_s_options).out_interval;
options.segment_on_key_frames_only = (*ccx_s_options).segment_on_key_frames_only != 0;
options.scc_framerate = (*ccx_s_options).scc_framerate;
options.scc_accurate_timing = (*ccx_s_options).enc_cfg.scc_accurate_timing != 0;
// Handle optional features with conditional compilation
#[cfg(feature = "with_libcurl")]
@@ -873,6 +875,7 @@ impl CType<u32> for StreamMode {
StreamMode::Gxf => ccx_stream_mode_enum_CCX_SM_GXF as _,
StreamMode::Mkv => ccx_stream_mode_enum_CCX_SM_MKV as _,
StreamMode::Mxf => ccx_stream_mode_enum_CCX_SM_MXF as _,
StreamMode::Scc => ccx_stream_mode_enum_CCX_SM_SCC as _,
StreamMode::Autodetect => ccx_stream_mode_enum_CCX_SM_AUTODETECT as _,
_ => ccx_stream_mode_enum_CCX_SM_ELEMENTARY_OR_NOT_FOUND as _,
}
@@ -972,6 +975,8 @@ impl CType<encoder_cfg> for EncoderConfig {
null_pointer()
},
extract_only_708: self.extract_only_708 as _,
scc_framerate: 0, // Will be set from ccx_options.scc_framerate in copy_to_c
scc_accurate_timing: 0, // Will be set from ccx_options.scc_accurate_timing in copy_to_c
}
}
}
@@ -1064,7 +1069,6 @@ impl CType<program_info> for ProgramInfo {
program_info {
pid: self.pid,
program_number: self.program_number,
initialized_ocr: self.initialized_ocr as c_int,
_bitfield_align_1: [],
_bitfield_1: bf1,
version: self.version,

Some files were not shown because too many files have changed in this diff Show More