001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.Iterator;
011 import java.util.LinkedHashMap;
012 import java.util.SortedMap;
013 import java.util.TreeMap;
014 import java.util.regex.PatternSyntaxException;
015
016 import org.maltparser.core.exception.MaltChainedException;
017 import org.maltparser.core.io.dataformat.ColumnDescription;
018 import org.maltparser.core.io.dataformat.DataFormatException;
019 import org.maltparser.core.io.dataformat.DataFormatInstance;
020 import org.maltparser.core.syntaxgraph.PhraseStructure;
021 import org.maltparser.core.syntaxgraph.TokenStructure;
022 import org.maltparser.core.syntaxgraph.edge.Edge;
023 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025 /**
026 *
027 *
028 * @author Johan Hall
029 */
030 public class NegraWriter implements SyntaxGraphWriter {
031 private BufferedWriter writer;
032 private DataFormatInstance dataFormatInstance;
033 private String optionString;
034 private int sentenceCount;
035 private LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
036 private int START_ID_OF_NONTERMINALS = 500;
037 private boolean closeStream = true;
038
039 public NegraWriter() {
040 nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
041 }
042
043 public void open(String fileName, String charsetName) throws MaltChainedException {
044 try {
045 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
046 } catch (FileNotFoundException e) {
047 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
048 } catch (UnsupportedEncodingException e) {
049 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
050 }
051 }
052
053 public void open(OutputStream os, String charsetName) throws MaltChainedException {
054 try {
055 if (os == System.out || os == System.err) {
056 closeStream = false;
057 }
058 open(new OutputStreamWriter(os, charsetName));
059 } catch (UnsupportedEncodingException e) {
060 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
061 }
062 }
063
064 private void open(OutputStreamWriter osw) throws MaltChainedException {
065 setWriter(new BufferedWriter(osw));
066 setSentenceCount(0);
067 }
068
069 public void writeProlog() throws MaltChainedException { }
070
071 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
072 if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
073 return;
074 }
075 PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
076 sentenceCount++;
077 try {
078 writer.write("#BOS ");
079 if (phraseStructure.getSentenceID() != 0) {
080 writer.write(Integer.toString(phraseStructure.getSentenceID()));
081 } else {
082 writer.write(Integer.toString(sentenceCount));
083 }
084 writer.write('\n');
085
086 if (phraseStructure.hasNonTerminals()) {
087 calculateIndices(phraseStructure);
088 writeTerminals(phraseStructure);
089 writeNonTerminals(phraseStructure);
090 } else {
091 writeTerminals(phraseStructure);
092 }
093 writer.write("#EOS ");
094 if (phraseStructure.getSentenceID() != 0) {
095 writer.write(Integer.toString(phraseStructure.getSentenceID()));
096 } else {
097 writer.write(Integer.toString(sentenceCount));
098 }
099 writer.write('\n');
100 } catch (IOException e) {
101 throw new DataFormatException("Could not write to the output file. ", e);
102 }
103 }
104 public void writeEpilog() throws MaltChainedException { }
105
106
107 private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
108 final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
109 for (int index : phraseStructure.getNonTerminalIndices()) {
110 heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
111 }
112
113 boolean done = false;
114 int h = 1;
115 int ntid = START_ID_OF_NONTERMINALS;
116 nonTerminalIndexMap.clear();
117 while (!done) {
118 done = true;
119 for (int index : phraseStructure.getNonTerminalIndices()) {
120 if (heights.get(index) == h) {
121 NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
122 nonTerminalIndexMap.put(nt.getIndex(), ntid++);
123 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
124 done = false;
125 }
126 }
127 h++;
128 }
129
130 // boolean done = false;
131 // int h = 1;
132 //// int ntid = START_ID_OF_NONTERMINALS;
133 //// nonTerminalIndexMap.clear();
134 // while (!done) {
135 // done = true;
136 // for (int index : phraseStructure.getNonTerminalIndices()) {
137 // if (heights.get(index) == h) {
138 // NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
139 //// nonTerminalIndexMap.put(nt.getIndex(), ntid++);
140 // nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
141 // done = false;
142 // }
143 // }
144 // h++;
145 // }
146 }
147
148 private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
149 try {
150 for (int index : phraseStructure.getTokenIndices()) {
151 final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
152 final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
153 ColumnDescription column = null;
154 int ti = 1;
155 while (columns.hasNext()) {
156 column = columns.next();
157 if (column.getCategory() == ColumnDescription.INPUT) {
158 writer.write(terminal.getLabelSymbol(column.getSymbolTable()));
159 int nTabs = 1;
160 if (ti == 1 || ti == 2) {
161 nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
162 } else if (ti == 3) {
163 nTabs = 1;
164 } else if (ti == 4) {
165 nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
166 }
167 if (nTabs < 1) {
168 nTabs = 1;
169 }
170 for (int j = 0; j < nTabs; j++) {
171 writer.write('\t');
172 }
173 ti++;
174 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
175 if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
176 writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
177 writer.write('\t');
178 } else {
179 writer.write("--\t");
180 }
181 } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) {
182 if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
183 writer.write('0');
184 } else {
185 writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
186 // writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
187 }
188 }
189 }
190 for (Edge e : terminal.getIncomingSecondaryEdges()) {
191 if (e.hasLabel(column.getSymbolTable())) {
192 writer.write('\t');
193 writer.write(e.getLabelSymbol(column.getSymbolTable()));
194 writer.write('\t');
195 if (e.getSource() instanceof NonTerminalNode) {
196 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
197 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
198 } else {
199 writer.write(Integer.toString(e.getSource().getIndex()));
200 }
201 }
202 }
203 writer.write("\n");
204 }
205
206 } catch (IOException e) {
207 throw new DataFormatException("The Negra writer is not able to write. ", e);
208 }
209 }
210
211 private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
212 for (int index : nonTerminalIndexMap.keySet()) {
213 // for (int index : phraseStructure.getNonTerminalIndices()) {
214 NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
215
216 if (nonTerminal == null || nonTerminal.isRoot()) {
217 return;
218 }
219 try {
220 writer.write('#');
221 // writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1));
222 writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
223 writer.write("\t\t\t--\t\t\t");
224 if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
225 writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
226 } else {
227 writer.write("--");
228 }
229 writer.write("\t--\t\t");
230 if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
231 writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
232 } else {
233 writer.write("--");
234 }
235 writer.write('\t');
236 if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
237 writer.write('0');
238 } else {
239 // writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
240 writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
241 }
242 for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
243 if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
244 writer.write('\t');
245 writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
246 writer.write('\t');
247 if (e.getSource() instanceof NonTerminalNode) {
248 // writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
249 writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
250 } else {
251 writer.write(Integer.toString(e.getSource().getIndex()));
252 }
253 }
254 }
255 writer.write("\n");
256 } catch (IOException e) {
257 throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
258 }
259 }
260 }
261
262 public BufferedWriter getWriter() {
263 return writer;
264 }
265
266 public void setWriter(BufferedWriter writer) {
267 this.writer = writer;
268 }
269
270 public int getSentenceCount() {
271 return sentenceCount;
272 }
273
274 public void setSentenceCount(int sentenceCount) {
275 this.sentenceCount = sentenceCount;
276 }
277
278 public DataFormatInstance getDataFormatInstance() {
279 return dataFormatInstance;
280 }
281
282 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
283 this.dataFormatInstance = dataFormatInstance;
284 }
285
286 public String getOptions() {
287 return optionString;
288 }
289
290 public void setOptions(String optionString) throws MaltChainedException {
291 this.optionString = optionString;
292 String[] argv;
293 try {
294 argv = optionString.split("[_\\p{Blank}]");
295 } catch (PatternSyntaxException e) {
296 throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
297 }
298 for (int i=0; i < argv.length-1; i++) {
299 if(argv[i].charAt(0) != '-') {
300 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
301 }
302 if(++i>=argv.length) {
303 throw new DataFormatException("The last argument does not have any value. ");
304 }
305 switch(argv[i-1].charAt(1)) {
306 case 's':
307 try {
308 START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
309 } catch (NumberFormatException e){
310 throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
311 }
312 break;
313 default:
314 throw new DataFormatException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
315 }
316 }
317 }
318
319 public void close() throws MaltChainedException {
320 try {
321 if (writer != null) {
322 writer.flush();
323 if (closeStream) {
324 writer.close();
325 }
326 writer = null;
327 }
328 } catch (IOException e) {
329 throw new DataFormatException("Could not close the output file. ", e);
330 }
331 }
332 }