From f9a0b6abf8871de1fe12e0df5f8a3e9769af0e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toma=C5=BE=20Erjavec?= Date: Wed, 16 Oct 2024 19:10:13 +0200 Subject: [PATCH] Add potential new corpora (DE, ES-AN, SK). --- Build/Scripts/mt-prepare4mt.xsl | 3 +++ Scripts/parlamint-add-common-content.xsl | 13 +++++++++++++ Scripts/parlamint-factorize-corpora.pl | 15 ++++++++++----- Scripts/parlamint2conllu.pl | 5 ++++- Scripts/parlamint2distro.pl | 17 +++++++++++------ Scripts/parlamintp2conllu.pl | 5 ++++- 6 files changed, 45 insertions(+), 13 deletions(-) diff --git a/Build/Scripts/mt-prepare4mt.xsl b/Build/Scripts/mt-prepare4mt.xsl index 729a54ded..f303e2108 100644 --- a/Build/Scripts/mt-prepare4mt.xsl +++ b/Build/Scripts/mt-prepare4mt.xsl @@ -82,7 +82,9 @@ zls bg cs + gmw da + roa itc grk @@ -98,6 +100,7 @@ zls sv sla + sla tr sla diff --git a/Scripts/parlamint-add-common-content.xsl b/Scripts/parlamint-add-common-content.xsl index ffda50ddf..77731cb76 100755 --- a/Scripts/parlamint-add-common-content.xsl +++ b/Scripts/parlamint-add-common-content.xsl @@ -110,6 +110,11 @@ Bicameralism Lower house + + Legislature + Bicameralism + Lower house + Legislature Unicameralism @@ -123,6 +128,10 @@ Bicameralism Lower house + + Legislature + Unicameralism + Legislature Unicameralism @@ -229,6 +238,10 @@ Bicameralism Lower house + + Legislature + Unicameralism + Legislature Unicameralism diff --git a/Scripts/parlamint-factorize-corpora.pl b/Scripts/parlamint-factorize-corpora.pl index 7f00b336b..69fe3b239 100755 --- a/Scripts/parlamint-factorize-corpora.pl +++ b/Scripts/parlamint-factorize-corpora.pl @@ -22,15 +22,17 @@ # Mapping of countries to languages, we need it for mapping of common taxonomies $country2lang{'AT'} = 'de'; $country2lang{'BA'} = 'bs'; -$country2lang{'BE'} = 'nl'; +$country2lang{'BE'} = 'nl, fr'; $country2lang{'BG'} = 'bg'; $country2lang{'CZ'} = 'cs'; +$country2lang{'DE'} = 'de'; $country2lang{'DK'} = 'da'; $country2lang{'EE'} = 'et'; $country2lang{'ES'} = 'es'; 
-$country2lang{'ES-CT'} = 'ca'; +$country2lang{'ES-AN'} = 'es'; +$country2lang{'ES-CT'} = 'ca, es'; $country2lang{'ES-GA'} = 'gl'; -$country2lang{'ES-PV'} = 'eu'; +$country2lang{'ES-PV'} = 'eu, es'; $country2lang{'FI'} = 'fi'; $country2lang{'FR'} = 'fr'; $country2lang{'GB'} = 'en'; @@ -49,8 +51,9 @@ $country2lang{'RS'} = 'sr'; $country2lang{'SE'} = 'sv'; $country2lang{'SI'} = 'sl'; +$country2lang{'SK'} = 'sk'; $country2lang{'TR'} = 'tr'; -$country2lang{'UA'} = 'uk'; +$country2lang{'UA'} = 'uk, ru'; $bkpName = "BKP"; $Saxon = "java -jar $Bin/bin/saxon.jar"; @@ -106,7 +109,9 @@ push(@missing_taxonomies, $taxonomyFName) } else {print STDERR "WARN: Inserting forced taxonomy file $taxonomyFName\n"} - my $command = "$Saxon if-lang-missing=skip langs='$country2lang{$country}' -xsl:$scriptTaxonomy"; + my $Language = $country2lang{$country}; + $Language =~ s/, .+//; #For multilingual corpora take the first language as main language + my $command = "$Saxon if-lang-missing=skip langs='$Language' -xsl:$scriptTaxonomy"; `$command $CommonTaxonomyFile > $taxonomyFile`; } } diff --git a/Scripts/parlamint2conllu.pl b/Scripts/parlamint2conllu.pl index a715cf906..11cf49e75 100755 --- a/Scripts/parlamint2conllu.pl +++ b/Scripts/parlamint2conllu.pl @@ -31,12 +31,14 @@ sub usage $country2lang{'AT'} = 'de'; $country2lang{'BA'} = 'bs'; -$country2lang{'BE'} = 'fr, nl'; +$country2lang{'BE'} = 'nl, fr'; $country2lang{'BG'} = 'bg'; $country2lang{'CZ'} = 'cs'; +$country2lang{'DE'} = 'de'; $country2lang{'DK'} = 'da'; $country2lang{'EE'} = 'et'; $country2lang{'ES'} = 'es'; +$country2lang{'ES-AN'} = 'es'; $country2lang{'ES-CT'} = 'ca, es'; $country2lang{'ES-GA'} = 'gl'; $country2lang{'ES-PV'} = 'eu, es'; @@ -58,6 +60,7 @@ sub usage $country2lang{'RS'} = 'sr'; $country2lang{'SE'} = 'sv'; $country2lang{'SI'} = 'sl'; +$country2lang{'SK'} = 'sk'; $country2lang{'TR'} = 'tr'; $country2lang{'UA'} = 'uk, ru'; diff --git a/Scripts/parlamint2distro.pl b/Scripts/parlamint2distro.pl index 
891602f49..e0bb7d620 100755 --- a/Scripts/parlamint2distro.pl +++ b/Scripts/parlamint2distro.pl @@ -124,15 +124,17 @@ sub usage { # Mapping of countries to languages, we need it for mapping of common taxonomies $country2lang{'AT'} = 'de'; $country2lang{'BA'} = 'bs'; -$country2lang{'BE'} = 'nl'; +$country2lang{'BE'} = 'nl, fr'; $country2lang{'BG'} = 'bg'; $country2lang{'CZ'} = 'cs'; +$country2lang{'DE'} = 'de'; $country2lang{'DK'} = 'da'; $country2lang{'EE'} = 'et'; $country2lang{'ES'} = 'es'; -$country2lang{'ES-CT'} = 'ca'; +$country2lang{'ES-AN'} = 'es'; +$country2lang{'ES-CT'} = 'ca, es'; $country2lang{'ES-GA'} = 'gl'; -$country2lang{'ES-PV'} = 'eu'; +$country2lang{'ES-PV'} = 'eu, es'; $country2lang{'FI'} = 'fi'; $country2lang{'FR'} = 'fr'; $country2lang{'GB'} = 'en'; @@ -151,8 +153,9 @@ sub usage { $country2lang{'RS'} = 'sr'; $country2lang{'SE'} = 'sv'; $country2lang{'SI'} = 'sl'; +$country2lang{'SK'} = 'sk'; $country2lang{'TR'} = 'tr'; -$country2lang{'UA'} = 'uk'; +$country2lang{'UA'} = 'uk, ru'; # Fake country for testing: $country2lang{'XX'} = 'hr'; @@ -403,8 +406,10 @@ sub commonTaxonomies { if ($taxonomy !~ /\.ana/ or ($taxonomy =~ /\.ana/ and ($outDir =~ /\.ana/ or $outDir !~ /\.TEI/))) { if (-e $taxonomy{$taxonomy}) { - if (exists($country2lang{$Country})) { - my $command = "$Saxon if-lang-missing=skip langs='$country2lang{$Country}' -xsl:$scriptTaxonomy"; + if (exists($country2lang{$Country})) { + my $Language = $country2lang{$Country}; + $Language =~ s/, .+//; #For multilingual corpora take the first language as main language + my $command = "$Saxon if-lang-missing=skip langs='$Language' -xsl:$scriptTaxonomy"; `$command $taxonomy{$taxonomy} > $outDir/$taxonomy.xml`; } else { diff --git a/Scripts/parlamintp2conllu.pl b/Scripts/parlamintp2conllu.pl index 45c46e288..62d6c41de 100755 --- a/Scripts/parlamintp2conllu.pl +++ b/Scripts/parlamintp2conllu.pl @@ -38,12 +38,14 @@ sub usage #This should be somehow factorised out!! 
$country2lang{'AT'} = 'de'; $country2lang{'BA'} = 'sr'; # Should be 'bs', but UD does not support it! -$country2lang{'BE'} = 'fr, nl'; +$country2lang{'BE'} = 'nl, fr'; $country2lang{'BG'} = 'bg'; $country2lang{'CZ'} = 'cs'; +$country2lang{'DE'} = 'de'; $country2lang{'DK'} = 'da'; $country2lang{'EE'} = 'et'; $country2lang{'ES'} = 'es'; +$country2lang{'ES-AN'} = 'es'; $country2lang{'ES-CT'} = 'ca, es'; $country2lang{'ES-GA'} = 'gl'; $country2lang{'ES-PV'} = 'eu, es'; @@ -65,6 +67,7 @@ sub usage $country2lang{'RS'} = 'sr'; $country2lang{'SE'} = 'sv'; $country2lang{'SI'} = 'sl'; +$country2lang{'SK'} = 'sk'; $country2lang{'TR'} = 'tr'; $country2lang{'UA'} = 'uk, ru'; # Fake country for testing: