forked from WING-NUS/ACL-Anthology-Codebase
-
Notifications
You must be signed in to change notification settings - Fork 0
/
AnthoXML2DBLPvBib.rb
executable file
·175 lines (156 loc) · 4.45 KB
/
AnthoXML2DBLPvBib.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env ruby
# -*- ruby -*-
require "rexml/document"
require "rexml/xpath"
require 'optparse'
require 'ostruct'
require 'time'
# defaults
# Script-wide settings. They live as class variables on Object, which is where
# a toplevel `@@NAME = value` stored them before Ruby 3.0, so code that reads
# them as @@NAME on older Rubies keeps working. They are assigned through
# Object.class_variable_set because Ruby 3.0+ raises RuntimeError
# ("class variable access from toplevel") for the literal @@ syntax here.
{
  :@@VERSION   => [1, 0],              # [major, minor]; joined with "." for -v
  :@@INTERVAL  => 100,                 # not referenced in the visible script
  :@@PROG_NAME => File.basename($0),   # name used in usage and error messages
  :@@DEBUG     => false,               # switched on by -d / --debug
  :@@OPT_X     => false                # switched on by -x / --extract_doi
}.each { |name, value| Object.class_variable_set(name, value) }
############################################################
# EXCEPTION HANDLING
# SIGINT (Ctrl-C) handler: prints a two-line notice on STDERR and exits with
# status -1.
int_handler = proc {
  # clean up code goes here
  # The program name is read from Object's class variables rather than with
  # the literal @@PROG_NAME syntax, which Ruby 3.0+ rejects at toplevel scope
  # (a proc defined at toplevel counts as toplevel scope).
  prog_name = Object.class_variable_get(:@@PROG_NAME)
  STDERR.puts "\n# #{prog_name} fatal\t\tReceived a 'SIGINT'\n# #{prog_name}\t\texiting cleanly"
  exit(-1)
}
trap "SIGINT", int_handler
# Strips leading and trailing whitespace from +str+.
#
# The string is modified in place and the same object is returned.
# The patterns are anchored with \A and \z (whole string) instead of ^ and $
# (per line), so a multi-line value is trimmed at its real start and end and
# whitespace before interior line breaks is left alone.
#
# str:: a mutable String
# Returns +str+ itself.
def canonicalize(str)
  str.sub!(/\s+\z/, "")
  str.sub!(/\A\s+/, "")
  str
end
# Renders the header markup for a volume entry.
#
# p::         element exposing attributes["id"] and elements["title"].text
# prefix::    URL prefix placed before the id inside <ee>
# in_volume:: pass true when a previous list is still open; only the literal
#             value true causes a closing </ul> to be emitted first
#
# Returns a String: optional "</ul>", an <h2> with the trimmed title, an
# opening <ul>, and a "Front Matter" item whose <ee> is prefix + id.
def process_volume(p, prefix, in_volume)
  volume_id = p.attributes["id"]
  heading = canonicalize(p.elements["title"].text)

  pieces = []
  pieces.push("</ul>\n") if in_volume == true
  pieces.push("<h2>", heading, "</h2>\n")
  pieces.push("<ul>\n")
  pieces.push("<li>X:\nFront Matter.\n0-\n")
  pieces.push("<ee>#{prefix}#{volume_id}</ee>\n")
  pieces.join
end
# Returns true when the script-wide flag +name+ (e.g. :@@DEBUG) is set to a
# truthy value. Flags are class variables on Object; they are read through
# reflection because Ruby 3.0+ rejects the literal @@ syntax in toplevel
# methods. An undefined flag counts as false.
def paper_setting?(name)
  Object.class_variable_defined?(name) && Object.class_variable_get(name) ? true : false
end

# Builds the display name for one <author> element.
# With a <first> child the trimmed <first> and <last> texts are joined by a
# space (missing or empty parts are skipped); otherwise the element's own text
# is used and a warning is printed if it contains a comma.
# Returns nil when no name text is available.
def paper_author_name(a, pid)
  if a.elements["first"] != nil # Thang fix
    texts = ["first", "last"].map { |tag|
      node = a.elements[tag]
      node && node.text
    }.compact
    return nil if texts.empty?
    return texts.map { |t| canonicalize(t) }.reject { |t| t.empty? }.join(" ")
  end
  author = a.text
  if author && author.match(/,/)
    STDERR.puts "comma detected in author name for paper ##{pid}\t#{author}"
  end
  author
end

# Scans a BibTeX file for `doi = {...}` and `pages = {...}` lines.
# Returns [doi, pages]; each is "" when the field is absent, and the last
# occurrence wins. Raises SystemCallError when the file cannot be read.
def paper_bib_fields(bib_file)
  doi = ""
  pages = ""
  # The block form closes the file even on error; scrub keeps a stray
  # non-UTF-8 byte from making the regexp match raise.
  File.foreach(bib_file, encoding: "UTF-8") do |raw|
    line = raw.scrub
    if (md = line.match(/^\s*doi\s*=\s*\{(.+)\}\s*/)) then doi = md[1] end
    if (md = line.match(/^\s*pages\s*=\s*\{(.+)\}\s*/)) then pages = md[1] end
  end
  [doi, pages]
end

# Renders one paper as a list item.
#
# p::      element exposing attributes ("id", optional "href") and elements
#          ("author"*, "title", optional "pages")
# prefix:: URL prefix; prefix + id is the default <ee> target
# path::   directory (with trailing slash, or "") that holds the bib files
# stem::   file stem; the bib file is "#{path}#{stem}-#{id}.bib"
#
# Returns "<li>authors:\ntitle\npage_range\n<ee>target</ee>\n".
# Exits the process with status -1 when an author has no usable name.
def process_paper(p, prefix, path, stem)
  pid = p.attributes["id"]
  href = p.attributes["href"]

  # authors
  authors = []
  p.elements.each("author") { |a|
    author = paper_author_name(a, pid)
    if author.nil?
      STDERR.puts "No author correctly found for paper ##{pid}, dying!"
      exit(-1)
    end
    authors.push(canonicalize(author))
  }
  author_string = authors.join(", ") + ":"

  # title: add a final period when the title ends in a letter or digit
  # (\z anchors at the end of the whole string, not of a line)
  title = canonicalize(p.elements["title"].text)
  title += "." if title.match(/[a-z0-9A-Z]\z/)

  # ee (points now to anthology, use -x to extract DOI from bib file)
  ee = prefix + pid

  # page range (from <pages> in XML, else from bib file if present)
  page_range = "0-"
  bib_file = nil
  pages = p.elements["pages"]
  if pages != nil
    page_range = pages.text
  else
    bib_file = "#{path}#{stem}-#{pid}.bib"
    begin
      doi, bib_pages = paper_bib_fields(bib_file)
      # BibTeX writes ranges as "1--8" (or with an en dash); emit "1-8".
      page_range = bib_pages.sub(/-{2,}|\u2013/, "-") unless bib_pages.empty?
      # Keep the anthology URL when the bib file carries no DOI.
      ee = doi if paper_setting?(:@@OPT_X) && !doi.empty?
    rescue SystemCallError
      STDERR.puts "Warning no bib file for #{pid}."
    end
  end
  # NOTE(review): the bib file, and therefore the DOI for -x, is only
  # consulted when the XML has no <pages> element -- confirm this is intended.
  puts "BIBFILE #{bib_file}" if paper_setting?(:@@DEBUG)

  # an explicit href attribute overrides any other target
  target = href != nil ? href : ee
  "<li>#{author_string}\n#{title}\n#{page_range}\n<ee>#{target}</ee>\n"
end
############################################################
# set up options
# Script-wide settings are class variables on Object. They are read and
# written through Object.class_variable_get/set because Ruby 3.0+ raises
# RuntimeError for the literal @@ syntax at toplevel scope.
prog_name = Object.class_variable_get(:@@PROG_NAME)
version = Object.class_variable_get(:@@VERSION)
mode = "conference"
parser = OptionParser.new do |opts|
  opts.banner = "usage: #{prog_name} [options] antho_file.xml > dblp_file.html"
  opts.separator ""
  opts.on_tail("-d", "--debug", "Turn record matching debugging on") do
    Object.class_variable_set(:@@DEBUG, true)
  end
  opts.on_tail("-m", "--mode [MODE]", "operation mode (either workshop or default: conference)") do |opt|
    # a bare -m keeps the documented default instead of leaving the mode nil
    mode = opt || "conference"
  end
  opts.on_tail("-h", "--help", "Show this message") do puts opts; exit end
  opts.on_tail("-v", "--version", "Show version") do puts "#{prog_name} " + version.join('.'); exit end
  opts.on_tail("-x", "--extract_doi", "Electronic edition to DOI instead of ACL Anthology") do
    Object.class_variable_set(:@@OPT_X, true)
  end
end
parser.parse!
Object.class_variable_set(:@@MODE, mode)

# An unknown mode would leave the stem and volume pattern unset, which makes
# every paper look like a volume header; refuse it up front.
unless ["workshop", "conference"].include?(mode)
  STDERR.puts "#{prog_name}: unknown mode '#{mode}' (expected workshop or conference)"
  exit(-1)
end

############################################################
# Main program
input = ARGV[0]
if input.nil?
  STDERR.puts parser
  exit(-1)
end

# split the argument into directory (with trailing slash, or "") and file name
if (md = input.match(/(.+\/)([^\/]+$)/))
  path = md[1]
  basename = md[2]
else
  path = ""
  basename = input
end

# stem: file name without its last extension; workshop mode also drops the
# last hyphen-separated part. Volume headers are papers whose id ends in
# "00" (workshop) or "000" (conference).
stem = basename.split(/\./)[0..-2].join
if mode == "workshop"
  stem = stem.split(/\-/)[0..-2].join
  volume_pattern = "00"
else
  volume_pattern = "000"
end
prefix = "http://www.aclweb.org/anthology/#{stem}-"

# REXML builds the whole document in the constructor, so the file can be
# closed as soon as the block returns.
doc = File.open(input) { |f| REXML::Document.new(f) }

in_volume = false
doc.elements.each("//paper") { |paper|
  if paper.attributes["id"].end_with?(volume_pattern) #Thang modify replace 000 by #{volumePattern}
    puts process_volume(paper, prefix, in_volume)
  else
    puts process_paper(paper, prefix, path, stem)
  end
  # A volume header opens a <ul> too, so the next header must close it even
  # when no paper came in between.
  in_volume = true
}
puts "</ul>"