001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012 import java.util.SortedMap;
013 import java.util.TreeMap;
014 import java.util.regex.PatternSyntaxException;
015
016 import org.maltparser.core.exception.MaltChainedException;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
021 import org.maltparser.core.syntaxgraph.PhraseStructure;
022 import org.maltparser.core.syntaxgraph.TokenStructure;
023 import org.maltparser.core.syntaxgraph.edge.Edge;
024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025
026 /**
027 *
028 *
029 * @author Johan Hall
030 */
031 public class NegraReader implements SyntaxGraphReader {
032 private enum NegraTables {
033 ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF
034 };
035 private BufferedReader reader;
036 private DataFormatInstance dataFormatInstance;
037 private int sentenceCount;
038 private String optionString;
039 private int formatVersion;
040 private NegraTables currentHeaderTable;
041 private int currentTerminalSize;
042 private int currentNonTerminalSize;
043 private SortedMap<Integer,PhraseStructureNode> nonterminals;
044 private StringBuilder edgelabelSymbol;
045 private StringBuilder edgelabelTableName;
046 private int START_ID_OF_NONTERMINALS = 500;
047 private String fileName = null;
048 private URL url = null;
049 private String charsetName;
050 private int nIterations;
051 private int cIterations;
052 private boolean closeStream = true;
053
054 public NegraReader() {
055 currentHeaderTable = NegraTables.UNDEF;
056 edgelabelSymbol = new StringBuilder();
057 edgelabelTableName = new StringBuilder();
058 nonterminals = new TreeMap<Integer,PhraseStructureNode>();
059 nIterations = 1;
060 cIterations = 1;
061 }
062
063 private void reopen() throws MaltChainedException {
064 close();
065 if (fileName != null) {
066 open(fileName, charsetName);
067 } else if (url != null) {
068 open(url, charsetName);
069 } else {
070 throw new DataFormatException("The input stream cannot be reopen. ");
071 }
072 }
073
074 public void open(String fileName, String charsetName) throws MaltChainedException {
075 setFileName(fileName);
076 setCharsetName(charsetName);
077 try {
078 open(new FileInputStream(fileName), charsetName);
079 } catch (FileNotFoundException e) {
080 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081 }
082 }
083 public void open(URL url, String charsetName) throws MaltChainedException {
084 setUrl(url);
085 setCharsetName(charsetName);
086 try {
087 open(url.openStream(), charsetName);
088 } catch (IOException e) {
089 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090 }
091 }
092
093 public void open(InputStream is, String charsetName) throws MaltChainedException {
094 try {
095 if (is == System.in) {
096 closeStream = false;
097 }
098 open(new InputStreamReader(is, charsetName));
099 } catch (UnsupportedEncodingException e) {
100 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101 }
102 }
103
104 private void open(InputStreamReader isr) throws MaltChainedException {
105 setReader(new BufferedReader(isr));
106 setSentenceCount(0);
107 }
108
109 public void readProlog() throws MaltChainedException {
110
111 }
112
113 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114 if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) {
115 return false;
116 }
117 syntaxGraph.clear();
118 final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
119 PhraseStructureNode parent = null;
120 PhraseStructureNode child = null;
121 currentHeaderTable = NegraTables.UNDEF;
122 String line = null;
123 syntaxGraph.clear();
124 nonterminals.clear();
125 try {
126 while (true) {
127 line = reader.readLine();
128 if (line == null) {
129 if (syntaxGraph.hasTokens()) {
130 sentenceCount++;
131 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
132 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
133 }
134 }
135 if (cIterations < nIterations) {
136 cIterations++;
137 reopen();
138 return true;
139 }
140 return false;
141 } else if (line.startsWith("#EOS")) {
142 currentTerminalSize = 0;
143 currentNonTerminalSize = 0;
144 currentHeaderTable = NegraTables.UNDEF;
145 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
146 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
147 }
148 return true;
149 } else if (line.startsWith("#BOS")) {
150 currentHeaderTable = NegraTables.SENTENCE;
151 int s = -1, e = -1;
152 for (int i = 5, n = line.length(); i < n; i++) {
153 if (Character.isDigit(line.charAt(i)) && s == -1) {
154 s = i;
155 }
156 if (line.charAt(i) == ' ') {
157 e = i;
158 break;
159 }
160 }
161 if (s != e && s != -1 && e != -1) {
162 phraseStructure.setSentenceID(Integer.parseInt(line.substring(s,e)));
163 }
164 sentenceCount++;
165 } else if (currentHeaderTable == NegraTables.SENTENCE) {
166 if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal
167 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
168 ColumnDescription column = null;
169 currentNonTerminalSize++;
170 char[] lineChars = line.toCharArray();
171 int start = 0;
172 int secedgecounter = 0;
173 for (int i = 0, n = lineChars.length; i < n; i++) {
174 if (lineChars[i] == '\t' && start == i) {
175 start++;
176 } else if (lineChars[i] == '\t' || i == n - 1) {
177 if (columns.hasNext()) {
178 column = columns.next();
179 }
180 if (column.getPosition() == 0) {
181 int index = Integer.parseInt((i == n - 1)?line.substring(start+1):line.substring(start+1, i));
182 child = nonterminals.get(index);
183 if (child == null) {
184 if (index != 0) {
185 child = ((PhraseStructure)syntaxGraph).addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
186 }
187 nonterminals.put(index,child);
188 }
189 } else if (column.getPosition() == 2 && child != null) {
190 syntaxGraph.addLabel(child, "CAT", (i == n - 1)?line.substring(start):line.substring(start, i));
191 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
192 edgelabelSymbol.setLength(0);
193 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
194 edgelabelTableName.setLength(0);
195 edgelabelTableName.append(column.getName());
196 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
197 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
198 parent = nonterminals.get(index);
199 if (parent == null) {
200 if (index == 0) {
201 parent = phraseStructure.getPhraseStructureRoot();
202 } else {
203 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
204 }
205 nonterminals.put(index,parent);
206 }
207 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
208 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
209 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
210 if (secedgecounter % 2 == 0) {
211 edgelabelSymbol.setLength(0);
212 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
213 secedgecounter++;
214 } else {
215 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
216 if (index == 0) {
217 parent = phraseStructure.getPhraseStructureRoot();
218 } else if (index < START_ID_OF_NONTERMINALS) {
219 parent = phraseStructure.getTokenNode(index);
220 } else {
221 parent = nonterminals.get(index);
222 if (parent == null) {
223 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
224 nonterminals.put(index,parent);
225 }
226 }
227 Edge e = phraseStructure.addSecondaryEdge(parent, child);
228 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
229 secedgecounter++;
230 }
231 }
232 start = i + 1;
233 }
234 }
235 } else { // Terminal
236 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
237 ColumnDescription column = null;
238
239 currentTerminalSize++;
240 child = syntaxGraph.addTokenNode(currentTerminalSize);
241 char[] lineChars = line.toCharArray();
242 int start = 0;
243 int secedgecounter = 0;
244 for (int i = 0, n = lineChars.length; i < n; i++) {
245 if (lineChars[i] == '\t' && start == i) {
246 start++;
247 } else if (lineChars[i] == '\t' || i == n - 1) {
248 if (columns.hasNext()) {
249 column = columns.next();
250 }
251 if (column.getCategory() == ColumnDescription.INPUT && child != null) {
252 syntaxGraph.addLabel(child, column.getName(), (i == n - 1)?line.substring(start):line.substring(start, i));
253 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) {
254 edgelabelSymbol.setLength(0);
255 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
256 edgelabelTableName.setLength(0);
257 edgelabelTableName.append(column.getName());
258 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) {
259 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
260 parent = nonterminals.get(index);
261 if (parent == null) {
262 if (index == 0) {
263 parent = phraseStructure.getPhraseStructureRoot();
264 } else {
265 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
266 }
267 nonterminals.put(index,parent);
268 }
269
270 Edge e = phraseStructure.addPhraseStructureEdge(parent, child);
271 syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString());
272 } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) {
273 if (secedgecounter % 2 == 0) {
274 edgelabelSymbol.setLength(0);
275 edgelabelSymbol.append((i == n - 1)?line.substring(start):line.substring(start, i));
276 secedgecounter++;
277 } else {
278 int index = Integer.parseInt((i == n - 1)?line.substring(start):line.substring(start, i));
279 if (index == 0) {
280 parent = phraseStructure.getPhraseStructureRoot();
281 } else if (index < START_ID_OF_NONTERMINALS) {
282 parent = phraseStructure.getTokenNode(index);
283 } else {
284 parent = nonterminals.get(index);
285 if (parent == null) {
286 parent = phraseStructure.addNonTerminalNode(index-START_ID_OF_NONTERMINALS+1);
287 nonterminals.put(index,parent);
288 }
289 }
290 Edge e = phraseStructure.addSecondaryEdge(parent, child);
291 e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString());
292 secedgecounter++;
293 }
294 }
295 start = i + 1;
296 }
297 }
298 }
299 } else if (line.startsWith("%%")) { // comment skip
300
301 } else if (line.startsWith("#FORMAT")) {
302 // int index = line.indexOf(' ');
303 // if (index > -1) {
304 // try {
305 // formatVersion = Integer.parseInt(line.substring(index+1));
306 // } catch (NumberFormatException e) {
307 //
308 // }
309 // }
310 } else if (line.startsWith("#BOT")) {
311 // int index = line.indexOf(' ');
312 // if (index > -1) {
313 // if (line.substring(index+1).equals("ORIGIN")) {
314 // currentHeaderTable = NegraTables.ORIGIN;
315 // } else if (line.substring(index+1).equals("EDITOR")) {
316 // currentHeaderTable = NegraTables.EDITOR;
317 // } else if (line.substring(index+1).equals("WORDTAG")) {
318 // currentHeaderTable = NegraTables.WORDTAG;
319 // } else if (line.substring(index+1).equals("MORPHTAG")) {
320 // currentHeaderTable = NegraTables.MORPHTAG;
321 // } else if (line.substring(index+1).equals("NODETAG")) {
322 // currentHeaderTable = NegraTables.NODETAG;
323 // } else if (line.substring(index+1).equals("EDGETAG")) {
324 // currentHeaderTable = NegraTables.EDGETAG;
325 // } else if (line.substring(index+1).equals("SECEDGETAG")) {
326 // currentHeaderTable = NegraTables.SECEDGETAG;
327 // } else {
328 // currentHeaderTable = NegraTables.UNDEF;
329 // }
330 // }
331 } else if (line.startsWith("#EOT")) {
332 currentHeaderTable = NegraTables.UNDEF;
333 }
334 }
335 } catch (IOException e) {
336 throw new DataFormatException("Error when reading from the input file. ", e);
337 }
338 }
339
340 public void readEpilog() throws MaltChainedException {
341
342 }
343
344 public BufferedReader getReader() {
345 return reader;
346 }
347
348 public void setReader(BufferedReader reader) {
349 this.reader = reader;
350 }
351
352 public int getSentenceCount() {
353 return sentenceCount;
354 }
355
356 public void setSentenceCount(int sentenceCount) {
357 this.sentenceCount = sentenceCount;
358 }
359
360 public int getFormatVersion() {
361 return formatVersion;
362 }
363
364 public void setFormatVersion(int formatVersion) {
365 this.formatVersion = formatVersion;
366 }
367
368 public DataFormatInstance getDataFormatInstance() {
369 return dataFormatInstance;
370 }
371
372 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
373 this.dataFormatInstance = inputDataFormatInstance;
374 }
375
376 public String getOptions() {
377 return optionString;
378 }
379
380 public void setOptions(String optionString) throws MaltChainedException {
381 this.optionString = optionString;
382
383 String[] argv;
384 try {
385 argv = optionString.split("[_\\p{Blank}]");
386 } catch (PatternSyntaxException e) {
387 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
388 }
389 for (int i=0; i < argv.length-1; i++) {
390 if(argv[i].charAt(0) != '-') {
391 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
392 }
393 if(++i>=argv.length) {
394 throw new DataFormatException("The last argument does not have any value. ");
395 }
396 switch(argv[i-1].charAt(1)) {
397 case 's':
398 try {
399 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
400 } catch (NumberFormatException e){
401 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
402 }
403 break;
404 default:
405 throw new DataFormatException("Unknown NegraReader parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
406 }
407 }
408 }
409
410 public String getFileName() {
411 return fileName;
412 }
413
414 public void setFileName(String fileName) {
415 this.fileName = fileName;
416 }
417
418 public URL getUrl() {
419 return url;
420 }
421
422 public void setUrl(URL url) {
423 this.url = url;
424 }
425
426 public String getCharsetName() {
427 return charsetName;
428 }
429
430 public void setCharsetName(String charsetName) {
431 this.charsetName = charsetName;
432 }
433
434 public int getNIterations() {
435 return nIterations;
436 }
437
438 public void setNIterations(int iterations) {
439 nIterations = iterations;
440 }
441
442 public int getIterationCounter() {
443 return cIterations;
444 }
445
446 public void close() throws MaltChainedException {
447 try {
448 if (reader != null) {
449 if (closeStream) {
450 reader.close();
451 }
452 reader = null;
453 }
454 } catch (IOException e) {
455 throw new DataFormatException("Error when closing the input file.", e);
456 }
457 }
458 }