001package org.maltparser.core.syntaxgraph.reader; 002 003import java.io.BufferedReader; 004import java.io.FileInputStream; 005import java.io.FileNotFoundException; 006import java.io.IOException; 007import java.io.InputStream; 008import java.io.InputStreamReader; 009import java.io.UnsupportedEncodingException; 010import java.net.URL; 011import java.util.Iterator; 012import java.util.SortedMap; 013 014import org.maltparser.core.exception.MaltChainedException; 015import org.maltparser.core.io.dataformat.ColumnDescription; 016import org.maltparser.core.io.dataformat.DataFormatException; 017import org.maltparser.core.io.dataformat.DataFormatInstance; 018import org.maltparser.core.symbol.SymbolTableHandler; 019import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; 020import org.maltparser.core.syntaxgraph.PhraseStructure; 021import org.maltparser.core.syntaxgraph.TokenStructure; 022import org.maltparser.core.syntaxgraph.edge.Edge; 023import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; 024import org.maltparser.core.syntaxgraph.node.TokenNode; 025/** 026* 027* 028* @author Johan Hall 029*/ 030public class BracketReader implements SyntaxGraphReader { 031 private BufferedReader reader; 032 private DataFormatInstance dataFormatInstance; 033 private int sentenceCount; 034 private StringBuilder input; 035 private int terminalCounter; 036 private int nonTerminalCounter; 037 private String optionString; 038 private SortedMap<String,ColumnDescription> inputColumns; 039 private SortedMap<String,ColumnDescription> edgeLabelColumns; 040 private SortedMap<String,ColumnDescription> phraseLabelColumns; 041 042 private String fileName = null; 043 private URL url = null; 044 private String charsetName; 045 private int nIterations; 046 private int cIterations; 047 private boolean closeStream = true; 048 049 private char STARTING_BRACKET = '('; 050 private char CLOSING_BRACKET = ')'; 051 private char INPUT_SEPARATOR = ' '; 052 private char EDGELABEL_SEPARATOR = '-'; 053 private char SENTENCE_SEPARATOR = '\n'; 054 private char BLANK = ' '; 055 private char CARRIAGE_RETURN = '\r'; 056 private char TAB = '\t'; 057 058 public BracketReader() { 059 input = new StringBuilder(); 060 nIterations = 1; 061 cIterations = 1; 062 } 063 064 private void reopen() throws MaltChainedException { 065 close(); 066 if (fileName != null) { 067 open(fileName, charsetName); 068 } else if (url != null) { 069 open(url, charsetName); 070 } else { 071 throw new DataFormatException("The input stream cannot be reopen. "); 072 } 073 } 074 075 public void open(String fileName, String charsetName) throws MaltChainedException { 076 setFileName(fileName); 077 setCharsetName(charsetName); 078 try { 079 open(new FileInputStream(fileName), charsetName); 080 }catch (FileNotFoundException e) { 081 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e); 082 } 083 } 084 public void open(URL url, String charsetName) throws MaltChainedException { 085 setUrl(url); 086 setCharsetName(charsetName); 087 try { 088 open(url.openStream(), charsetName); 089 } catch (IOException e) { 090 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e); 091 } 092 } 093 094 public void open(InputStream is, String charsetName) throws MaltChainedException { 095 try { 096 if (is == System.in) { 097 closeStream = false; 098 } 099 open(new InputStreamReader(is, charsetName)); 100 } catch (UnsupportedEncodingException e) { 101 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e); 102 } 103 } 104 105 private void open(InputStreamReader isr) throws MaltChainedException { 106 setReader(new BufferedReader(isr)); 107 setSentenceCount(0); 108 } 109 110 public void readProlog() throws MaltChainedException { 111 112 } 113 114 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { 115 if (syntaxGraph == null || dataFormatInstance == null) { 116 return false; 117 } 118 syntaxGraph.clear(); 119 int brackets = 0; 120 try { 121 int l = reader.read(); 122 char c; 123 input.setLength(0); 124 125 while (true) { 126 if (l == -1) { 127 input.setLength(0); 128 return false; 129 } 130 131 c = (char)l; 132 l = reader.read(); 133 134 if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) { 135 136 } else if (c == STARTING_BRACKET) { 137 input.append(c); 138 brackets++; 139 } else if (c == CLOSING_BRACKET) { 140 input.append(c); 141 brackets--; 142 } else if (c == INPUT_SEPARATOR) { 143 if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) { 144 input.append(c); 145 } 146 // Start BracketProgLangReader 147 } else if (c == '\\') { 148 c = (char) l; 149 l = reader.read(); 150 if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') { 151// System.out.println("Error"); 152 System.exit(1); 153 } else { 154 input.append("\\" + c); 155 } 156 // End BracketProgLangReader 157 } else if (brackets != 0){ 158 input.append(c); 159 } 160 if (brackets == 0 && input.length() != 0) { 161 sentenceCount++; 162 terminalCounter = 1; 163 nonTerminalCounter = 1; 164 if (syntaxGraph instanceof PhraseStructure) { 165 bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null); 166 if (syntaxGraph instanceof MappablePhraseStructureGraph) { 167 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot()); 168 } 169 } 170 return true; 171 } 172 173 if (c == -1) { 174 if (brackets != 0) { 175 close(); 176 throw new MaltChainedException("Error when reading from the input file. "); 177 } 178 if (cIterations < nIterations) { 179 cIterations++; 180 reopen(); 181 return true; 182 } 183 return false; 184 } 185 } 186 } catch (IOException e) { 187 close(); 188 throw new MaltChainedException("Error when reading from the input file. ", e); 189 } 190 191 } 192 193 private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException { 194 int bracketsdepth = 0; 195 int startpos = start-1; 196 for (int i = start, n = end; i < n; i++) { 197 if (input.charAt(i) == STARTING_BRACKET 198 // Start BracketProgLangReader 199 && (i == 0 || input.charAt(i - 1) != '\\') 200 // end BracketProgLangReader 201 202 ) { 203 if (bracketsdepth == 0) { 204 startpos = i; 205 } 206 bracketsdepth++; 207 } else if (input.charAt(i) == CLOSING_BRACKET 208 // Start BracketProgLangReader 209 && (i == 0 || input.charAt(i - 1) != '\\') 210 // end BracketProgLangReader 211 ) { 212 bracketsdepth--; 213 if (bracketsdepth == 0) { 214 extract(phraseStructure, startpos+1, i, parent); 215 } 216 } 217 } 218 } 219 220 private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException { 221 SymbolTableHandler symbolTables = phraseStructure.getSymbolTables(); 222 int index = -1; 223 for (int i = begin; i < end; i++) { 224 if (input.charAt(i) == STARTING_BRACKET 225 // Start BracketProgLangReader 226 && (i == begin || input.charAt(i - 1) != '\\') 227 // end BracketProgLangReader 228 ) { 229 index = i; 230 break; 231 } 232 } 233 if (index == -1) { 234 TokenNode t = phraseStructure.addTokenNode(terminalCounter); 235 if (t == null) { 236 close(); 237 throw new MaltChainedException("Bracket Reader error: could not create a terminal node. "); 238 } 239 240 terminalCounter++; 241 Edge e = null; 242 243 if (parent != null) { 244 e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t); 245 } else { 246 close(); 247 throw new MaltChainedException("Bracket Reader error: could not find the parent node. "); 248 } 249 250 int start = begin; 251 252 Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator(); 253 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 254 boolean noneNode = false; 255 boolean edgeLabels = false; 256 for (int i = begin; i < end; i++) { 257 if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR 258 // Start BracketProgLangReader 259 && (i == begin || input.charAt(i - 1) != '\\') 260 // end BracketProgLangReader 261 ) || i == end - 1) { 262 if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) { 263 noneNode = true; 264 } else if (start == begin) { 265 if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) { 266 if (inputColumnsIterator.hasNext()) { 267 268 t.addLabel(symbolTables.getSymbolTable(inputColumns.get(inputColumnsIterator.next()).getName()), 269 270 // Start BracketProgLangReader 271 decodeString( 272 // end BracketProgLangReader 273 (i == end - 1)?input.substring(start,end):input.substring(start, i) 274 // Start BracketProgLangReader 275 ) 276 // end BracketProgLangReader 277 ); 278 } 279 start = i + 1; 280 if (input.charAt(i) == EDGELABEL_SEPARATOR) { 281 edgeLabels = true; 282 } 283 } 284 } else if (edgeLabels && e != null) { 285 if (edgeLabelsColumnsIterator.hasNext()) { 286 e.addLabel(symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getName()), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 287 } 288 start = i + 1; 289 if (input.charAt(i) == INPUT_SEPARATOR 290 // Start BracketProgLangReader 291 && (i == begin || input.charAt(i - 1) != '\\') 292 // end BracketProgLangReader 293 ) { 294 edgeLabels = false; 295 } 296 } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR 297 // Start BracketProgLangReader 298 && (i == begin || input.charAt(i - 1) != '\\') 299 // end BracketProgLangReader 300 ) 301 ) { 302 } else { 303 if (inputColumnsIterator.hasNext()) { 304 t.addLabel(symbolTables.getSymbolTable(inputColumns.get(inputColumnsIterator.next()).getName()), (i == end - 1)?input.substring(start,end):input.substring(start, i)); 305 } 306 start = i + 1; 307 } 308 } 309 } 310 } else { 311 PhraseStructureNode nt; 312 Edge e = null; 313 if (parent == null) { 314 nt = phraseStructure.getPhraseStructureRoot(); 315 } else { 316 nt = phraseStructure.addNonTerminalNode(nonTerminalCounter); 317 if (nt == null) { 318 close(); 319 throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. "); 320 } 321 nonTerminalCounter++; 322 323 e = phraseStructure.addPhraseStructureEdge(parent, nt); 324 } 325 Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator(); 326 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); 327 int newbegin = begin; 328 int start = begin; 329 330 for (int i = begin; i < index; i++) { 331 if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) { 332 if (start == newbegin) { 333 if (phraseLabelColumnsIterator.hasNext()) { 334 nt.addLabel(symbolTables.getSymbolTable(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getName()), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 335 } 336 start = i + 1; 337 } else if (e != null) { 338 if (edgeLabelsColumnsIterator.hasNext()) { 339 e.addLabel(symbolTables.getSymbolTable(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getName()), (i == index - 1)?input.substring(start,index):input.substring(start, i)); 340 } 341 start = i + 1; 342 } 343 } else if (input.charAt(i) == BLANK) { 344 start++; 345 newbegin++; 346 } 347 } 348 349 bracketing(phraseStructure, index, end, nt); 350 } 351 } 352 353 private String decodeString(String string) { 354 return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " "); 355 } 356 357 public void readEpilog() throws MaltChainedException { 358 359 } 360 361 public BufferedReader getReader() { 362 return reader; 363 } 364 365 public void setReader(BufferedReader reader) { 366 this.reader = reader; 367 } 368 369 public int getSentenceCount() throws MaltChainedException { 370 return sentenceCount; 371 } 372 373 public void setSentenceCount(int sentenceCount) { 374 this.sentenceCount = sentenceCount; 375 } 376 377 public DataFormatInstance getDataFormatInstance() { 378 return dataFormatInstance; 379 } 380 381 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { 382 this.dataFormatInstance = inputDataFormatInstance; 383 inputColumns = dataFormatInstance.getInputColumnDescriptions(); 384 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); 385 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); 386 } 387 388 public String getOptions() { 389 return optionString; 390 } 391 392 public void setOptions(String optionString) throws MaltChainedException { 393 this.optionString = optionString; 394 } 395 396 public String getFileName() { 397 return fileName; 398 } 399 400 public void setFileName(String fileName) { 401 this.fileName = fileName; 402 } 403 404 public URL getUrl() { 405 return url; 406 } 407 408 public void setUrl(URL url) { 409 this.url = url; 410 } 411 412 public String getCharsetName() { 413 return charsetName; 414 } 415 416 public void setCharsetName(String charsetName) { 417 this.charsetName = charsetName; 418 } 419 420 public int getNIterations() { 421 return nIterations; 422 } 423 424 public void setNIterations(int iterations) { 425 nIterations = iterations; 426 } 427 428 public int getIterationCounter() { 429 return cIterations; 430 } 431 432 public void close() throws MaltChainedException { 433 try { 434 if (reader != null) { 435 if (closeStream) { 436 reader.close(); 437 } 438 reader = null; 439 } 440 } catch (IOException e) { 441 throw new DataFormatException("Error when closing the input file.", e); 442 } 443 } 444}