#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this file. If not, see http://www.gnu.org/licenses/.
#
#   Copyright © 2021-2023 The University of Tromsø &
#                         the Norwegian Sámi Parliament
#   http://giellatekno.uit.no & http://divvun.no
#
"""Turn cesDoc xml into our homegrown xml."""

import argparse
import glob
import os
from datetime import datetime

from lxml import etree

from corpustools import argparse_version, corpuspath, util

# Year written to the metadata per language; languages not listed here
# fall back to the current year (see save_chapter).
_LANGUAGE_YEAR = {"nob": 2011, "sme": 2019}


def parse_options():
    """Parse the options for this script.

    Returns:
        argparse.Namespace: with the attributes ``lang``, ``testament``
        (either ``"ot"`` or ``"nt"``) and ``cesdoc``, plus whatever the
        shared ``argparse_version`` parent parser contributes.
    """
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description="Turn cesDoc xml into our homegrown xml.",
    )

    parser.add_argument("lang", help="Language of the file")
    parser.add_argument("testament", choices=["ot", "nt"], help="Old or new testament")
    parser.add_argument("cesdoc", help="The cesdoc that should be converted")

    return parser.parse_args()


def _bibel_no_dir(language, testament):
    """Return $GTBOUND/orig/<language>/bible/<testament>/bibel.no.

    Raises:
        RuntimeError: if the GTBOUND environment variable is not set.
    """
    gtbound = os.getenv("GTBOUND")
    if gtbound is None:
        raise RuntimeError("The environment variable GTBOUND is not set")

    return os.path.join(gtbound, "orig", language, "bible", testament, "bibel.no")


def _iter_chapters(tree):
    """Yield (file stem, chapter element) for every chapter of every book.

    Books and chapters are numbered from 1 in document order; the stem is
    the zero padded book number (two digits) and chapter number (three
    digits) joined by an underscore, e.g. ``01_001``.
    """
    for bookindex, book in enumerate(tree.xpath(".//div[@type='book']"), start=1):
        for chapterindex, chapter in enumerate(
            book.xpath(".//div[@type='chapter']"), start=1
        ):
            yield f"{bookindex:0>2}_{chapterindex:0>3}", chapter


def main():
    """Turn cesDoc to homegrown xml."""
    args = parse_options()
    tree = etree.parse(args.cesdoc)
    address = os.path.basename(args.cesdoc)

    # Build a real list, not a generator: every chapter must be written to
    # disk regardless of how many nob files set_parallels manages to pair
    # it with. A lazy generator would stop saving as soon as zip() ran out
    # of nob names.
    chapter_paths = [
        save_chapter(args.lang, args.testament, stem, get_verses(chapter), address)
        for stem, chapter in _iter_chapters(tree)
    ]

    # nob is the base the other languages are aligned against; pairing it
    # with itself would only produce self-referencing parallel entries.
    if args.lang != "nob":
        set_parallels(chapter_paths, args.testament, args.lang)


def get_verses(chapter):
    """Extract the chapter content.

    Args:
        chapter: an lxml element containing ``seg`` elements.

    Returns:
        A new ``body`` element holding one ``verse`` child per ``seg``.
        The ``number`` attribute of a verse is the part of the seg's
        ``id`` after the last dot, its text is the stripped seg text.
    """
    body = etree.Element("body")
    for seg in chapter.iter("seg"):
        verse = etree.SubElement(body, "verse")
        verse.set("number", seg.get("id").split(".")[-1])
        # seg.text is None for an empty <seg/>; keep the verse, but empty.
        verse.text = (seg.text or "").strip()

    return body


def save_chapter(language, testament, filename, body, address):
    """Save chapter info.

    Writes the metadata of the chapter and then the chapter itself as
    ``<document><body>...</body></document>`` to
    $GTBOUND/orig/<language>/bible/<testament>/bibel.no/<filename>.xml

    Args:
        language: language code, used both in the path and as mainlang.
        testament: "ot" or "nt", used as a path component.
        filename: name of the chapter file, without the .xml suffix.
        body: the ``body`` element to store in the document.
        address: value stored in the ``filename`` metadata variable.

    Returns:
        The object returned by corpuspath.make_corpus_path for the file.

    Raises:
        RuntimeError: if the GTBOUND environment variable is not set.
    """
    name = os.path.join(_bibel_no_dir(language, testament), f"{filename}.xml")
    # Best effort: the directory usually exists already.
    with util.ignored(OSError):
        os.makedirs(os.path.dirname(name))

    path = corpuspath.make_corpus_path(name)
    path.metadata.set_variable("filename", address)
    path.metadata.set_variable("mainlang", language)
    path.metadata.set_variable("genre", "bible")
    path.metadata.set_variable("monolingual", "1")
    path.metadata.set_variable("license_type", "standard")
    path.metadata.set_variable(
        "year", _LANGUAGE_YEAR.get(language, datetime.now().year)
    )
    path.metadata.write_file()

    root = etree.Element("document")
    root.append(body)

    with open(name, "wb") as page_stream:
        page_stream.write(etree.tostring(root, encoding="utf8", pretty_print=True))

    return path


def set_parallels(chapter_paths, testament, new_lang):
    """Set the parallels.

    Use the nob names as the base, it has all the books and chapters.

    The chapters are paired with the sorted nob file names by position;
    surplus items on either side are left untouched. For each pair the
    nob file, the new chapter and every file already registered as a
    parallel of the nob file get to know about each other.

    Args:
        chapter_paths: iterable of the objects returned by save_chapter.
        testament: "ot" or "nt", selects the nob directory.
        new_lang: language code of the chapters in chapter_paths.

    Raises:
        RuntimeError: if the GTBOUND environment variable is not set.
    """
    nob_names = sorted(
        glob.glob(os.path.join(_bibel_no_dir("nob", testament), "*.xml"))
    )

    for chapter_path, nob_name in zip(chapter_paths, nob_names, strict=False):
        nob_path = corpuspath.make_corpus_path(nob_name)
        nob_meta = nob_path.metadata
        chapter_meta = chapter_path.metadata
        chapter_name = os.path.basename(chapter_path.orig)

        chapter_meta.set_parallel_text("nob", os.path.basename(nob_name))
        nob_meta.set_parallel_text(new_lang, chapter_name)
        nob_meta.write_file()

        for lang, filename in nob_meta.get_parallel_texts().items():
            # NOTE(review): new_lang was registered on nob_meta just above,
            # so it presumably shows up here; skip it so the chapter is not
            # recorded as a parallel of itself.
            if lang == new_lang:
                continue
            chapter_meta.set_parallel_text(lang, filename)
            parallel_path = corpuspath.make_corpus_path(nob_path.parallel(lang))
            parallel_path.metadata.set_parallel_text(new_lang, chapter_name)
            parallel_path.metadata.write_file()

        chapter_meta.write_file()