001 package org.maltparser.core.syntaxgraph.writer;
002
003 import java.io.BufferedWriter;
004 import java.io.FileNotFoundException;
005 import java.io.FileOutputStream;
006 import java.io.IOException;
007 import java.io.OutputStream;
008 import java.io.OutputStreamWriter;
009 import java.io.UnsupportedEncodingException;
010 import java.util.SortedMap;
011 import java.util.regex.PatternSyntaxException;
012
013 import org.maltparser.core.exception.MaltChainedException;
014 import org.maltparser.core.io.dataformat.ColumnDescription;
015 import org.maltparser.core.io.dataformat.DataFormatException;
016 import org.maltparser.core.io.dataformat.DataFormatInstance;
017 import org.maltparser.core.symbol.SymbolTable;
018 import org.maltparser.core.syntaxgraph.PhraseStructure;
019 import org.maltparser.core.syntaxgraph.TokenStructure;
020 import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
021 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
022 import org.maltparser.core.syntaxgraph.node.TokenNode;
023 /**
024 *
025 *
026 * @author Johan Hall
027 */
028 public class BracketWriter implements SyntaxGraphWriter {
029 private enum PennWriterFormat {
030 DEFAULT, PRETTY
031 };
032 private PennWriterFormat format;
033 private BufferedWriter writer;
034 private DataFormatInstance dataFormatInstance;
035 private SortedMap<String,ColumnDescription> inputColumns;
036 private SortedMap<String,ColumnDescription> edgeLabelColumns;
037 private SortedMap<String,ColumnDescription> phraseLabelColumns;
038 private char STARTING_BRACKET = '(';
039 private String EMPTY_EDGELABEL = "??";
040 private char CLOSING_BRACKET = ')';
041 private char INPUT_SEPARATOR = ' ';
042 private char EDGELABEL_SEPARATOR = '-';
043 private char SENTENCE_SEPARATOR = '\n';
044 private String optionString;
045 private boolean closeStream = true;
046
047 public BracketWriter() {
048 }
049
050 public void open(String fileName, String charsetName) throws MaltChainedException {
051 try {
052 open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
053 } catch (FileNotFoundException e) {
054 throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
055 } catch (UnsupportedEncodingException e) {
056 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
057 }
058 }
059
060 public void open(OutputStream os, String charsetName) throws MaltChainedException {
061 try {
062 if (os == System.out || os == System.err) {
063 closeStream = false;
064 }
065 open(new OutputStreamWriter(os, charsetName));
066 } catch (UnsupportedEncodingException e) {
067 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
068 }
069 }
070
071 private void open(OutputStreamWriter osw) throws MaltChainedException {
072 setWriter(new BufferedWriter(osw));
073 }
074
075 public void writeEpilog() throws MaltChainedException {
076
077 }
078
079 public void writeProlog() throws MaltChainedException {
080
081 }
082
083 public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
084 if (syntaxGraph == null || dataFormatInstance == null) {
085 return;
086 }
087 if (syntaxGraph instanceof PhraseStructure && syntaxGraph.hasTokens()) {
088 // PhraseStructure phraseStructure = ((PhraseStructure) syntaxGraph);
089 if (format == PennWriterFormat.PRETTY) {
090 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot(), 0);
091 } else {
092 writeElement(((PhraseStructure) syntaxGraph).getPhraseStructureRoot());
093 }
094 try {
095 writer.write(SENTENCE_SEPARATOR);
096 writer.flush();
097 } catch (IOException e) {
098 close();
099 throw new DataFormatException("Could not write to the output file. ", e);
100 }
101 }
102 }
103
104 private void writeElement(PhraseStructureNode element) throws MaltChainedException {
105 try {
106 if (element instanceof TokenNode) {
107 PhraseStructureNode t = (PhraseStructureNode)element;
108 SymbolTable table = null;
109 writer.write(STARTING_BRACKET);
110 int i = 0;
111 for (String inputColumn : inputColumns.keySet()) {
112 if (i != 0) {
113 writer.write(INPUT_SEPARATOR);
114 }
115 table = inputColumns.get(inputColumn).getSymbolTable();
116 if (t.hasLabel(table)) {
117 writer.write(t.getLabelSymbol(table));
118 }
119 if (i == 0) {
120 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
121 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
122 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
123 writer.write(EDGELABEL_SEPARATOR);
124 writer.write(t.getParentEdgeLabelSymbol(table));
125 }
126 }
127 }
128 i++;
129 }
130 writer.write(CLOSING_BRACKET);
131 } else {
132 NonTerminalNode nt = (NonTerminalNode)element;
133 writer.write(STARTING_BRACKET);
134 SymbolTable table = null;
135 int i = 0;
136 for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
137 if (i != 0) {
138 writer.write(INPUT_SEPARATOR);
139 }
140 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
141 if (nt.hasLabel(table)) {
142 writer.write(nt.getLabelSymbol(table));
143 }
144 if (i == 0) {
145 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
146 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
147 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
148 writer.write(EDGELABEL_SEPARATOR);
149 writer.write(nt.getParentEdgeLabelSymbol(table));
150 }
151 }
152 }
153 i++;
154 }
155 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
156 writeElement(node);
157 }
158 writer.write(CLOSING_BRACKET);
159 }
160 } catch (IOException e) {
161 throw new DataFormatException("Could not write to the output file. ", e);
162 }
163 }
164
165 private String getIndentation(int depth) {
166 StringBuilder sb = new StringBuilder("");
167 for (int i = 0; i < depth; i++) {
168 sb.append("\t");
169 }
170 return sb.toString();
171 }
172
173 private void writeElement(PhraseStructureNode element, int depth) throws MaltChainedException {
174 try {
175 if (element instanceof TokenNode) {
176 PhraseStructureNode t = (PhraseStructureNode)element;
177 SymbolTable table = null;
178 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
179 int i = 0;
180 for (String inputColumn : inputColumns.keySet()) {
181 if (i != 0) {
182 writer.write(INPUT_SEPARATOR);
183 }
184 table = inputColumns.get(inputColumn).getSymbolTable();
185 if (t.hasLabel(table)) {
186 writer.write(encodeString(t.getLabelSymbol(table)));
187 }
188 if (i == 0) {
189 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
190 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
191 if (t.hasParentEdgeLabel(table) && !t.getParent().isRoot() && !t.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
192 writer.write(EDGELABEL_SEPARATOR);
193 writer.write(t.getParentEdgeLabelSymbol(table));
194 }
195 }
196 }
197 i++;
198 }
199 writer.write(CLOSING_BRACKET);
200 } else {
201 NonTerminalNode nt = (NonTerminalNode)element;
202 writer.write("\n" + getIndentation(depth) + STARTING_BRACKET);
203 SymbolTable table = null;
204 int i = 0;
205 for (String phraseLabelColumn : phraseLabelColumns.keySet()) {
206 if (i != 0) {
207 writer.write(INPUT_SEPARATOR);
208 }
209 table = phraseLabelColumns.get(phraseLabelColumn).getSymbolTable();
210 if (nt.hasLabel(table)) {
211 writer.write(nt.getLabelSymbol(table));
212 }
213 if (i == 0) {
214 for (String edgeLabelColumn : edgeLabelColumns.keySet()) {
215 table = edgeLabelColumns.get(edgeLabelColumn).getSymbolTable();
216 if (nt.hasParentEdgeLabel(table) && !nt.getParent().isRoot() && !nt.getParentEdgeLabelSymbol(table).equals(EMPTY_EDGELABEL)) {
217 writer.write(EDGELABEL_SEPARATOR);
218 writer.write(nt.getParentEdgeLabelSymbol(table));
219 }
220 }
221 }
222 i++;
223 }
224 for (PhraseStructureNode node : ((NonTerminalNode)element).getChildren()) {
225 writeElement(node, depth + 1);
226 }
227 writer.write("\n" + getIndentation(depth) + CLOSING_BRACKET);
228 }
229 } catch (IOException e) {
230 throw new DataFormatException("Could not write to the output file. ", e);
231 }
232 }
233
234 public BufferedWriter getWriter() {
235 return writer;
236 }
237
238 public void setWriter(BufferedWriter writer) throws MaltChainedException {
239 close();
240 this.writer = writer;
241 }
242
243 public DataFormatInstance getDataFormatInstance() {
244 return dataFormatInstance;
245 }
246
247 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
248 this.dataFormatInstance = dataFormatInstance;
249 inputColumns = dataFormatInstance.getInputColumnDescriptions();
250 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
251 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
252 }
253
254 public String getOptions() {
255 return optionString;
256 }
257
258 public void setOptions(String optionString) throws MaltChainedException {
259 this.optionString = optionString;
260 format = PennWriterFormat.DEFAULT;
261
262 String[] argv;
263 try {
264 argv = optionString.split("[_\\p{Blank}]");
265 } catch (PatternSyntaxException e) {
266 throw new DataFormatException("Could not split the bracket writer option '"+optionString+"'. ", e);
267 }
268 for (int i=0; i < argv.length-1; i++) {
269 if(argv[i].charAt(0) != '-') {
270 throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
271 }
272 if(++i>=argv.length) {
273 throw new DataFormatException("The last argument does not have any value. ");
274 }
275 switch(argv[i-1].charAt(1)) {
276 case 'f':
277 if (argv[i].equals("p")) {
278 format = PennWriterFormat.PRETTY;
279 } else if (argv[i].equals("p")) {
280 format = PennWriterFormat.DEFAULT;
281 }
282 break;
283 default:
284 throw new DataFormatException("Unknown bracket writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");
285 }
286 }
287 }
288
289 public void close() throws MaltChainedException {
290 try {
291 if (writer != null) {
292 writer.flush();
293 if (closeStream) {
294 writer.close();
295 }
296 writer = null;
297 }
298 } catch (IOException e) {
299 throw new DataFormatException("Could not close the output file. ", e);
300 }
301 }
302
303 private String encodeString(String string) {
304 return string.replace("(", "-LRB-").replace(")", "-RRB-").replace("[", "-LSB-").replace("]", "-RSB-").replace("{", "-LCB-").replace("}", "-RCB-");
305 }
306 }