import os
from pathlib import Path

def _chapter_bases(book_dir):
    """Return the sorted chapter base names of the ``*.txt`` files in *book_dir*.

    A base name is a file stem with a trailing ``_REDO`` marker removed, so
    ``1.txt`` and ``1_REDO.txt`` both contribute the single base ``"1"``.
    """
    redo_suffix = "_REDO"
    bases = set()
    for path in book_dir.glob("*.txt"):
        # glob() also matches directories whose name ends in ".txt"; those
        # cannot be read as chapters, so ignore them here.
        if not path.is_file():
            continue
        stem = path.stem
        if stem.endswith(redo_suffix):
            stem = stem[:-len(redo_suffix)]
        bases.add(stem)
    return sorted(bases)


def _select_maya_file(book_dir, base):
    """Return the Maya chapter path for *base*, preferring the ``_REDO`` file."""
    redo_file = book_dir / f"{base}_REDO.txt"
    if redo_file.is_file():
        return redo_file
    return book_dir / f"{base}.txt"


def _read_stripped_lines(path):
    """Read *path* as UTF-8 and return its lines with outer whitespace removed."""
    return [line.strip() for line in path.read_text(encoding='utf-8').splitlines()]


def collect_bible_data(maya_root, spanish_root, out_src, out_tgt) -> None:
    """Concatenate line-aligned Maya/Spanish chapter files into two text files.

    Every sub-directory of *maya_root* is treated as a book and is paired with
    the sub-directory of the same name under *spanish_root*. Inside a book,
    each Maya chapter ``<base>.txt`` (or ``<base>_REDO.txt``, which takes
    precedence when present) is paired with ``<base>_es.txt`` in the Spanish
    book. A pair is written only when both files exist and contain the same
    number of lines; otherwise a message is printed and the pair is skipped.
    Books and chapters are processed in sorted name order, and each line is
    stripped of leading/trailing whitespace before being written.

    Args:
        maya_root: Directory holding one sub-directory per book (``~`` is expanded).
        spanish_root: Directory with the matching Spanish books (``~`` is expanded).
        out_src: Output path for the Maya lines; overwritten if it exists.
        out_tgt: Output path for the Spanish lines; overwritten if it exists.

    Returns:
        None. Progress and problems are reported with ``print``. If either
        root is missing or is not a directory, the function returns before
        touching the output files.
    """
    maya_root = Path(maya_root).expanduser()
    spanish_root = Path(spanish_root).expanduser()
    out_src = Path(out_src).expanduser()
    out_tgt = Path(out_tgt).expanduser()

    print(f"Looking in Maya dir: {maya_root}")
    print(f"Looking in Spanish dir: {spanish_root}")

    if not maya_root.exists():
        print("Maya root directory does not exist!")
        return
    if not spanish_root.exists():
        print("Spanish root directory does not exist!")
        return
    # exists() is also true for regular files. Reject those here, before the
    # outputs are opened for writing, so a bad argument cannot truncate the
    # results of a previous run and then fail inside iterdir().
    if not maya_root.is_dir():
        print(f"Maya root is not a directory: {maya_root}")
        return
    if not spanish_root.is_dir():
        print(f"Spanish root is not a directory: {spanish_root}")
        return

    # Make sure the output locations exist so open('w') cannot fail on a
    # missing parent directory.
    for out_path in (out_src, out_tgt):
        out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_src.open('w', encoding='utf-8') as f_src, out_tgt.open('w', encoding='utf-8') as f_tgt:
        for maya_book in sorted(maya_root.iterdir()):
            if not maya_book.is_dir():
                continue

            spanish_book = spanish_root / maya_book.name
            if not spanish_book.is_dir():
                print(f"Spanish directory missing: {spanish_book}")
                continue

            for base in _chapter_bases(maya_book):
                maya_file = _select_maya_file(maya_book, base)
                spanish_file = spanish_book / f"{base}_es.txt"

                if not (maya_file.is_file() and spanish_file.is_file()):
                    print(f"Missing file(s) for {maya_book.name}/{base}")
                    continue

                maya_lines = _read_stripped_lines(maya_file)
                spanish_lines = _read_stripped_lines(spanish_file)

                # The two outputs must stay aligned line by line, so a chapter
                # whose line counts differ is dropped entirely.
                if len(maya_lines) != len(spanish_lines):
                    print(f"Line mismatch in {maya_book.name}/{base}: {len(maya_lines)} vs {len(spanish_lines)}")
                    continue

                f_src.writelines(line + '\n' for line in maya_lines)
                f_tgt.writelines(line + '\n' for line in spanish_lines)

    print(f"\nBible concatenation complete.\nSource: {out_src}\nTarget: {out_tgt}")


# Run the concatenation only when this file is executed as a script, so that
# importing the module does not rewrite the output files as a side effect.
if __name__ == "__main__":
    collect_bible_data(
        maya_root="~/bible_data/jwBibleMaya",
        spanish_root="~/bible_data/jwBibleSpanish",
        out_src="~/full_bible_yua.txt",
        out_tgt="~/full_bible_es.txt"
    )
