fix(rust): Support BCP 47 language tags in --mkvlang option

The --mkvlang option previously only supported single ISO 639-2 codes because it used a Language enum with a fixed list of variants. Extended codes (like "fre-ca") and multiple codes (like "eng,chi") would panic.

This change introduces MkvLangFilter, a proper type for language filtering that:
- Validates language codes per BCP 47 specification
- Supports ISO 639-2 (3-letter codes like "eng")
- Supports BCP 47 tags (like "en-US", "zh-Hans-CN")
- Supports comma-separated multiple codes
- Provides clean error messages for invalid input
- Includes comprehensive unit tests

The C code continues to receive the raw string for strstr() matching, maintaining backward compatibility.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 05:26:31 +00:00 · 2026-01-18 13:23:39 -08:00
parent 74e64c0421
commit 91d3512bcc
6 changed files with 407 additions and 30 deletions
--- a/docs/CHANGES.TXT
+++ b/docs/CHANGES.TXT
@@ -4,6 +4,7 @@
 - Fix: Prevent infinite loop on truncated MKV files
 - Fix: Various memory safety and stability fixes in demuxers (MP4, PS, MKV, DVB)
 - Fix: Delete empty output files instead of leaving 0-byte files (#1282)
+- Fix: --mkvlang now supports BCP 47 language tags (e.g., en-US, zh-Hans-CN) and multiple codes

 0.96.5 (2026-01-05)
 -------------------
--- a/src/rust/lib_ccxr/src/common/mkv_lang.rs
+++ b/src/rust/lib_ccxr/src/common/mkv_lang.rs
@@ -0,0 +1,388 @@
+//! MKV language filtering support.
+//!
+//! Matroska files support two language code formats:
+//! - ISO 639-2 (3-letter bibliographic codes): "eng", "fre", "chi"
+//! - BCP 47 / IETF language tags: "en-US", "fr-CA", "zh-Hans"
+//!
+//! This module provides [`MkvLangFilter`] for parsing and matching language codes.
+
+use std::fmt;
+use std::str::FromStr;
+
+/// A filter for matching MKV track languages.
+///
+/// Supports comma-separated lists of language codes in either:
+/// - ISO 639-2 format (3-letter codes like "eng", "fre")
+/// - BCP 47 format (tags like "en-US", "fr-CA", "zh-Hans")
+///
+/// Every entry is validated by [`LanguageCode::new`] when the filter is built,
+/// and a track matches the filter as soon as any one entry matches it.
+/// The input string is also kept (trimmed at both ends, otherwise unchanged)
+/// so it can be handed to the C side as-is.
+///
+/// # Examples
+///
+/// ```
+/// use lib_ccxr::common::MkvLangFilter;
+///
+/// // Single language
+/// let filter: MkvLangFilter = "eng".parse().unwrap();
+/// assert!(filter.matches("eng", None));
+///
+/// // Multiple languages
+/// let filter: MkvLangFilter = "eng,fre,chi".parse().unwrap();
+/// assert!(filter.matches("fre", None));
+///
+/// // BCP 47 matching
+/// let filter: MkvLangFilter = "en-US,fr-CA".parse().unwrap();
+/// assert!(filter.matches("eng", Some("en-US")));
+/// ```
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct MkvLangFilter {
+    /// The original input string (used for C FFI).
+    /// Only leading/trailing whitespace is removed; case and inner spacing are kept.
+    raw: String,
+    /// Parsed and validated language codes, in the order they appeared in the input
+    codes: Vec<LanguageCode>,
+}
+
+/// A single language code, either ISO 639-2 or BCP 47.
+///
+/// [`LanguageCode::new`] validates the input and stores it trimmed and
+/// lowercased, which is why the derived equality behaves case-insensitively
+/// for codes built through it.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct LanguageCode {
+    /// The normalized (lowercase) code
+    code: String,
+}
+
+/// Error type for invalid language codes.
+///
+/// Returned by [`LanguageCode::new`] and [`MkvLangFilter::new`]. Its `Display`
+/// output has the form `invalid language code '<code>': <reason>`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct InvalidLanguageCode {
+    /// The rejected code after trimming (empty when the whole filter string was empty)
+    pub code: String,
+    /// Description of what's wrong
+    pub reason: &'static str,
+}
+
+/// User-facing rendering: the rejected code in single quotes, then the reason.
+impl fmt::Display for InvalidLanguageCode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let InvalidLanguageCode { code, reason } = self;
+        write!(f, "invalid language code '{}': {}", code, reason)
+    }
+}
+
+// The trait's default methods are used as-is: this error wraps no underlying source.
+impl std::error::Error for InvalidLanguageCode {}
+
+impl LanguageCode {
+    /// Validates and creates a new language code.
+    ///
+    /// Surrounding whitespace is trimmed and the stored code is lowercased.
+    ///
+    /// Accepts:
+    /// - ISO 639-2 codes: 3 ASCII letters (e.g., "eng", "fre")
+    /// - BCP 47 tags: primary language with optional subtags separated by hyphens
+    ///   (e.g., "en-US", "fr-CA", "zh-Hans-CN")
+    ///
+    /// # BCP 47 Structure
+    /// - Primary language: 2-3 letters
+    /// - Extended language (optional): 3 letters
+    /// - Script (optional): 4 letters (e.g., "Hans", "Latn")
+    /// - Region (optional): 2 letters or 3 digits (e.g., "US", "419")
+    /// - Variant (optional): 5-8 alphanumeric characters, or 4 starting with a digit
+    /// - Extension (optional): a single-character subtag followed by one or more
+    ///   subtags of 2-8 alphanumeric characters (e.g., "u-ca-gregory")
+    /// - Private use (optional): "x" followed by one or more subtags of 1-8
+    ///   alphanumeric characters (e.g., "x-custom")
+    ///
+    /// Only the shape of each subtag is checked; the relative order of
+    /// script/region/variant subtags and registry membership are not.
+    ///
+    /// # Errors
+    /// Returns [`InvalidLanguageCode`] carrying the trimmed input and the reason
+    /// it was rejected.
+    pub fn new(code: &str) -> Result<Self, InvalidLanguageCode> {
+        // Grammar that applies to the subtag currently being validated.
+        #[derive(Clone, Copy, PartialEq)]
+        enum Section {
+            // Language, script, region and variant subtags.
+            Language,
+            // Subtags that follow an extension singleton.
+            Extension,
+            // Subtags that follow the private-use singleton "x".
+            PrivateUse,
+        }
+
+        const BAD_SUBTAG: &str = "invalid subtag format";
+        const DANGLING_SINGLETON: &str =
+            "singleton subtag must be followed by at least one subtag";
+
+        let code = code.trim();
+        let invalid = |reason: &'static str| InvalidLanguageCode {
+            code: code.to_string(),
+            reason,
+        };
+
+        if code.is_empty() {
+            return Err(invalid("empty language code"));
+        }
+
+        // Check for valid characters (alphanumeric and hyphens only)
+        if !code.chars().all(|c| c.is_ascii_alphanumeric() || c == '-') {
+            return Err(invalid(
+                "must contain only ASCII letters, digits, and hyphens",
+            ));
+        }
+
+        // Cannot start or end with hyphen
+        if code.starts_with('-') || code.ends_with('-') {
+            return Err(invalid("cannot start or end with hyphen"));
+        }
+
+        // Cannot have consecutive hyphens
+        if code.contains("--") {
+            return Err(invalid("cannot have consecutive hyphens"));
+        }
+
+        // From here on every subtag is known to be non-empty ASCII alphanumeric,
+        // so only lengths and letter/digit composition remain to be checked.
+        let mut subtags = code.split('-');
+
+        // First subtag must be the primary language (2-3 letters)
+        let primary = subtags.next().unwrap_or("");
+        if !(2..=3).contains(&primary.len()) {
+            return Err(invalid("primary language subtag must be 2-3 letters"));
+        }
+        if !primary.chars().all(|c| c.is_ascii_alphabetic()) {
+            return Err(invalid(
+                "primary language subtag must contain only letters",
+            ));
+        }
+
+        let mut section = Section::Language;
+        // True right after a singleton, until the subtag it requires has been seen.
+        let mut awaiting_subtag = false;
+
+        for subtag in subtags {
+            let len = subtag.len();
+
+            // Outside private use, a one-character subtag is a singleton: it opens
+            // an extension, or the private-use section when it is "x".
+            if section != Section::PrivateUse && len == 1 {
+                if awaiting_subtag {
+                    return Err(invalid(DANGLING_SINGLETON));
+                }
+                section = if subtag.eq_ignore_ascii_case("x") {
+                    Section::PrivateUse
+                } else {
+                    Section::Extension
+                };
+                awaiting_subtag = true;
+                continue;
+            }
+
+            let valid = match section {
+                // Private use: 1-8 alphanumerics, everything up to the end of the tag
+                Section::PrivateUse => len <= 8,
+                // Extension: 2-8 alphanumerics
+                Section::Extension => (2..=8).contains(&len),
+                Section::Language => {
+                    let all_alpha = subtag.bytes().all(|b| b.is_ascii_alphabetic());
+                    let all_digit = subtag.bytes().all(|b| b.is_ascii_digit());
+                    match len {
+                        // Region (2 letters)
+                        2 => all_alpha,
+                        // Extended language (3 letters) or region (3 digits)
+                        3 => all_alpha || all_digit,
+                        // Script (4 letters) or variant starting with a digit
+                        4 => all_alpha || subtag.starts_with(|c: char| c.is_ascii_digit()),
+                        // Variant
+                        5..=8 => true,
+                        _ => false,
+                    }
+                }
+            };
+
+            if !valid {
+                return Err(invalid(BAD_SUBTAG));
+            }
+            awaiting_subtag = false;
+        }
+
+        // A tag such as "en-x" ends on a singleton that introduces nothing.
+        if awaiting_subtag {
+            return Err(invalid(DANGLING_SINGLETON));
+        }
+
+        Ok(Self {
+            code: code.to_ascii_lowercase(),
+        })
+    }
+
+    /// Returns the normalized (lowercase) code.
+    pub fn as_str(&self) -> &str {
+        &self.code
+    }
+
+    /// Checks if this code matches a track's language.
+    ///
+    /// All comparisons are ASCII case-insensitive. The code matches when any of
+    /// the following holds:
+    /// 1. It equals `iso639`.
+    /// 2. It equals `bcp47`.
+    /// 3. It is a prefix of `bcp47` that ends at a subtag boundary
+    ///    (filter "en" matches a track tagged "en-US").
+    /// 4. `bcp47` is a prefix of it that ends at a subtag boundary
+    ///    (filter "en-US" matches a track tagged just "en").
+    ///
+    /// ISO 639-2 and BCP 47 codes are never translated into each other: "eng"
+    /// does not match a track whose only English marker is `bcp47 = "en-US"`.
+    pub fn matches(&self, iso639: &str, bcp47: Option<&str>) -> bool {
+        let code = self.code.as_str();
+
+        // Exact match on ISO 639-2
+        if code.eq_ignore_ascii_case(iso639) {
+            return true;
+        }
+
+        match bcp47 {
+            Some(tag) => {
+                code.eq_ignore_ascii_case(tag)
+                    || Self::is_subtag_prefix(code, tag)
+                    || Self::is_subtag_prefix(tag, code)
+            }
+            None => false,
+        }
+    }
+
+    /// Returns true when `tag` starts with `prefix` (ASCII case-insensitive)
+    /// immediately followed by a hyphen.
+    ///
+    /// Works on bytes so that arbitrary track metadata can never cause a
+    /// slice at a non-character boundary.
+    fn is_subtag_prefix(prefix: &str, tag: &str) -> bool {
+        let (prefix, tag) = (prefix.as_bytes(), tag.as_bytes());
+        match tag.split_at_checked_compat(prefix.len()) {
+            Some((head, rest)) => rest.first() == Some(&b'-') && head.eq_ignore_ascii_case(prefix),
+            None => false,
+        }
+    }
+}
+
+/// Checked `split_at` for byte slices that works on older toolchains.
+trait SplitAtCheckedCompat {
+    /// Splits at `mid`, or returns `None` when `mid` is past the end.
+    fn split_at_checked_compat(&self, mid: usize) -> Option<(&[u8], &[u8])>;
+}
+
+impl SplitAtCheckedCompat for [u8] {
+    fn split_at_checked_compat(&self, mid: usize) -> Option<(&[u8], &[u8])> {
+        if mid <= self.len() {
+            Some(self.split_at(mid))
+        } else {
+            None
+        }
+    }
+}
+
+/// Enables `"en-US".parse::<LanguageCode>()`; validation is exactly that of
+/// [`LanguageCode::new`].
+impl FromStr for LanguageCode {
+    type Err = InvalidLanguageCode;
+
+    fn from_str(text: &str) -> Result<Self, Self::Err> {
+        LanguageCode::new(text)
+    }
+}
+
+/// Prints the normalized code verbatim.
+impl fmt::Display for LanguageCode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(&self.code)
+    }
+}
+
+impl MkvLangFilter {
+    /// Builds a filter from a comma-separated list of language codes.
+    ///
+    /// The input is trimmed and split on `,`; each piece is validated with
+    /// [`LanguageCode::new`]. The trimmed input is retained as the raw string.
+    ///
+    /// # Errors
+    /// Returns [`InvalidLanguageCode`] when the input is empty after trimming,
+    /// or with the error of the first piece that fails validation.
+    pub fn new(input: &str) -> Result<Self, InvalidLanguageCode> {
+        let trimmed = input.trim();
+        if trimmed.is_empty() {
+            return Err(InvalidLanguageCode {
+                code: String::new(),
+                reason: "empty language filter",
+            });
+        }
+
+        let codes = trimmed
+            .split(',')
+            .map(LanguageCode::new)
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Ok(Self {
+            raw: trimmed.to_owned(),
+            codes,
+        })
+    }
+
+    /// Returns the raw input string (for C FFI compatibility).
+    pub fn as_raw_str(&self) -> &str {
+        self.raw.as_str()
+    }
+
+    /// Returns the parsed language codes.
+    pub fn codes(&self) -> &[LanguageCode] {
+        self.codes.as_slice()
+    }
+
+    /// Reports whether at least one code in the filter matches a track.
+    ///
+    /// # Arguments
+    /// - `iso639`: The track's ISO 639-2 language code (e.g., "eng")
+    /// - `bcp47`: The track's BCP 47 language tag, if available (e.g., "en-US")
+    pub fn matches(&self, iso639: &str, bcp47: Option<&str>) -> bool {
+        self.codes
+            .iter()
+            .any(|candidate| candidate.matches(iso639, bcp47))
+    }
+}
+
+/// Enables `"eng,fre".parse::<MkvLangFilter>()`; parsing is exactly that of
+/// [`MkvLangFilter::new`].
+impl FromStr for MkvLangFilter {
+    type Err = InvalidLanguageCode;
+
+    fn from_str(text: &str) -> Result<Self, Self::Err> {
+        MkvLangFilter::new(text)
+    }
+}
+
+/// Prints the raw filter string exactly as it was stored.
+impl fmt::Display for MkvLangFilter {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(&self.raw)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Asserts that every entry passes `LanguageCode::new`.
+    fn assert_all_valid(codes: &[&str]) {
+        for code in codes {
+            assert!(
+                LanguageCode::new(code).is_ok(),
+                "expected {:?} to be accepted",
+                code
+            );
+        }
+    }
+
+    /// Asserts that every entry is rejected by `LanguageCode::new`.
+    fn assert_all_invalid(codes: &[&str]) {
+        for code in codes {
+            assert!(
+                LanguageCode::new(code).is_err(),
+                "expected {:?} to be rejected",
+                code
+            );
+        }
+    }
+
+    #[test]
+    fn test_iso639_codes() {
+        // 3-letter codes, including an upper-case spelling
+        assert_all_valid(&["eng", "fre", "chi", "ENG"]);
+
+        // 2-letter codes (ISO 639-1 style, valid in BCP 47)
+        assert_all_valid(&["en", "fr"]);
+    }
+
+    #[test]
+    fn test_bcp47_codes() {
+        // Language + region
+        assert_all_valid(&["en-US", "fr-CA", "pt-BR"]);
+
+        // Language + script
+        assert_all_valid(&["zh-Hans", "zh-Hant", "sr-Latn"]);
+
+        // Language + script + region
+        assert_all_valid(&["zh-Hans-CN", "zh-Hant-TW"]);
+
+        // UN M.49 numeric region code (Latin America)
+        assert_all_valid(&["es-419"]);
+    }
+
+    #[test]
+    fn test_invalid_codes() {
+        // Too short
+        assert_all_invalid(&["a"]);
+
+        // Underscore, space and non-ASCII characters are not allowed
+        assert_all_invalid(&["en_US", "en US", "ça"]);
+
+        // Leading, trailing and doubled hyphens
+        assert_all_invalid(&["-en", "en-", "en--US"]);
+
+        // Empty
+        assert_all_invalid(&[""]);
+    }
+
+    #[test]
+    fn test_filter_multiple_codes() {
+        let filter = MkvLangFilter::new("eng,fre,chi").unwrap();
+        assert_eq!(filter.codes().len(), 3);
+        for listed in &["eng", "fre", "chi"] {
+            assert!(filter.matches(listed, None));
+        }
+        assert!(!filter.matches("spa", None));
+    }
+
+    #[test]
+    fn test_filter_bcp47_matching() {
+        let filter = MkvLangFilter::new("en-US,fr-CA").unwrap();
+
+        // Exact BCP 47 match
+        for (iso639, bcp47) in &[("eng", "en-US"), ("fre", "fr-CA")] {
+            assert!(filter.matches(iso639, Some(bcp47)));
+        }
+
+        // No match
+        assert!(!filter.matches("eng", Some("en-GB")));
+        assert!(!filter.matches("eng", None));
+    }
+
+    #[test]
+    fn test_filter_mixed_formats() {
+        let filter = MkvLangFilter::new("eng,fr-CA,zh-Hans").unwrap();
+
+        assert!(filter.matches("eng", None));
+        for (iso639, bcp47) in &[("fre", "fr-CA"), ("chi", "zh-Hans")] {
+            assert!(filter.matches(iso639, Some(bcp47)));
+        }
+    }
+
+    #[test]
+    fn test_case_insensitivity() {
+        let filter = MkvLangFilter::new("ENG,FR-CA").unwrap();
+        for iso639 in &["eng", "ENG"] {
+            assert!(filter.matches(iso639, None));
+        }
+        for (iso639, bcp47) in &[("fre", "fr-ca"), ("FRE", "FR-CA")] {
+            assert!(filter.matches(iso639, Some(bcp47)));
+        }
+    }
+
+    #[test]
+    fn test_raw_string_preserved() {
+        let filter = MkvLangFilter::new("eng,fre").unwrap();
+        assert_eq!(filter.as_raw_str(), "eng,fre");
+    }
+}
--- a/src/rust/lib_ccxr/src/common/mod.rs
+++ b/src/rust/lib_ccxr/src/common/mod.rs
@@ -18,8 +18,10 @@

 mod bitstream;
 mod constants;
+mod mkv_lang;
 mod options;

 pub use bitstream::*;
 pub use constants::*;
+pub use mkv_lang::*;
 pub use options::*;
--- a/src/rust/lib_ccxr/src/common/options.rs
+++ b/src/rust/lib_ccxr/src/common/options.rs
@@ -466,8 +466,9 @@ pub struct Options {
    pub ocr_line_split: bool,
    /// If true, use character blacklist to prevent common OCR errors (e.g. | vs I)
    pub ocr_blacklist: bool,
-    /// The name of the language stream for MKV
-    pub mkvlang: Option<Language>,
+    /// Language filter for MKV subtitle tracks.
+    /// Accepts comma-separated ISO 639-2 codes (e.g., "eng,fre") or BCP 47 tags (e.g., "en-US,fr-CA").
+    pub mkvlang: Option<super::MkvLangFilter>,
    /// If true, the video stream will be processed even if we're using a different one for subtitles.
    pub analyze_video_stream: bool,

--- a/src/rust/src/common.rs
+++ b/src/rust/src/common.rs
@@ -18,6 +18,7 @@ use lib_ccxr::common::DtvccServiceCharset;
 use lib_ccxr::common::EncoderConfig;
 use lib_ccxr::common::EncodersTranscriptFormat;
 use lib_ccxr::common::Language;
+use lib_ccxr::common::MkvLangFilter;
 use lib_ccxr::common::Options;
 use lib_ccxr::common::OutputFormat;
 use lib_ccxr::common::SelectCodec;
@@ -183,9 +184,9 @@ pub unsafe fn copy_from_rust(ccx_s_options: *mut ccx_s_options, options: Options
    (*ccx_s_options).ocr_quantmode = options.ocr_quantmode as _;
    (*ccx_s_options).ocr_line_split = options.ocr_line_split as _;
    (*ccx_s_options).ocr_blacklist = options.ocr_blacklist as _;
-    if let Some(mkvlang) = options.mkvlang {
+    if let Some(ref mkvlang) = options.mkvlang {
        (*ccx_s_options).mkvlang =
-            replace_rust_c_string((*ccx_s_options).mkvlang, mkvlang.to_ctype().as_str());
+            replace_rust_c_string((*ccx_s_options).mkvlang, mkvlang.as_raw_str());
    }
    (*ccx_s_options).analyze_video_stream = options.analyze_video_stream as _;
    (*ccx_s_options).hardsubx_ocr_mode = options.hardsubx_ocr_mode.to_ctype();
@@ -425,12 +426,10 @@ pub unsafe fn copy_to_rust(ccx_s_options: *const ccx_s_options) -> Options {
    options.ocr_line_split = (*ccx_s_options).ocr_line_split != 0;
    options.ocr_blacklist = (*ccx_s_options).ocr_blacklist != 0;

-    // Handle mkvlang (C string to Option<Language>)
+    // Handle mkvlang (C string to Option<MkvLangFilter>)
    if !(*ccx_s_options).mkvlang.is_null() {
-        options.mkvlang = Some(
-            Language::from_str(&c_char_to_string((*ccx_s_options).mkvlang))
-                .expect("Invalid language"),
-        )
+        let lang_str = c_char_to_string((*ccx_s_options).mkvlang);
+        options.mkvlang = MkvLangFilter::new(&lang_str).ok();
    }

    options.analyze_video_stream = (*ccx_s_options).analyze_video_stream != 0;
--- a/src/rust/src/parser.rs
+++ b/src/rust/src/parser.rs
@@ -133,24 +133,6 @@ fn process_word_file(filename: &str, list: &mut Vec<String>) -> Result<(), std::
    }
    Ok(())
 }
-fn mkvlang_params_check(lang: &str) {
-    for part in lang.split(',') {
-        let count = part.chars().count();
-        if !(3..=6).contains(&count) {
-            fatal!(
-                cause = ExitCause::MalformedParameter;
-                "language codes should be xxx,xxx,xxx,....\n"
-            );
-        }
-
-        if count == 6 && !part.contains('-') {
-            fatal!(
-                cause = ExitCause::MalformedParameter;
-                "last language code is not of the form xxx-xx\n"
-            );
-        }
-    }
-}

 fn get_file_buffer_size() -> i32 {
    unsafe { FILEBUFFERSIZE }
@@ -769,9 +751,13 @@ impl OptionsExt for Options {
        }

        if let Some(ref lang) = args.mkvlang {
-            self.mkvlang = Some(Language::from_str(lang.as_str()).unwrap());
-            let str = lang.as_str();
-            mkvlang_params_check(str);
+            match MkvLangFilter::new(lang.as_str()) {
+                Ok(filter) => self.mkvlang = Some(filter),
+                Err(e) => fatal!(
+                    cause = ExitCause::MalformedParameter;
+                    "{}\n", e
+                ),
+            }
        }
        if args.srt
            || args.mcc