001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012
013 import org.maltparser.core.exception.MaltChainedException;
014 import org.maltparser.core.io.dataformat.ColumnDescription;
015 import org.maltparser.core.io.dataformat.DataFormatException;
016 import org.maltparser.core.io.dataformat.DataFormatInstance;
017 import org.maltparser.core.syntaxgraph.DependencyStructure;
018 import org.maltparser.core.syntaxgraph.Element;
019 import org.maltparser.core.syntaxgraph.TokenStructure;
020 import org.maltparser.core.syntaxgraph.edge.Edge;
021 /**
022 *
023 *
024 * @author Johan Hall
025 */
026 public class TabReader implements SyntaxGraphReader {
027 private BufferedReader reader;
028 private int sentenceCount;
029 private final StringBuilder input;
030 private DataFormatInstance dataFormatInstance;
031 private static final String IGNORE_COLUMN_SIGN = "_";
032 private static final char TAB = '\t';
033 private static final char NEWLINE = '\n';
034 private static final char CARRIAGE_RETURN = '\r';
035 private String fileName = null;
036 private URL url = null;
037 private String charsetName;
038 private int nIterations;
039 private int cIterations;
040 private boolean closeStream = true;
041
042 public TabReader() {
043 input = new StringBuilder();
044 nIterations = 1;
045 cIterations = 1;
046 }
047
048 private void reopen() throws MaltChainedException {
049 close();
050 if (fileName != null) {
051 open(fileName, charsetName);
052 } else if (url != null) {
053 open(url, charsetName);
054 } else {
055 throw new DataFormatException("The input stream cannot be reopen. ");
056 }
057 }
058
059 public void open(String fileName, String charsetName) throws MaltChainedException {
060 setFileName(fileName);
061 setCharsetName(charsetName);
062 try {
063 open(new FileInputStream(fileName), charsetName);
064 } catch (FileNotFoundException e) {
065 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
066 }
067 }
068
069 public void open(URL url, String charsetName) throws MaltChainedException {
070 setUrl(url);
071 setCharsetName(charsetName);
072 if (url == null) {
073 throw new DataFormatException("The input file cannot be found. ");
074 }
075 try {
076 open(url.openStream(), charsetName);
077 } catch (IOException e) {
078 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
079 }
080 }
081
082 public void open(InputStream is, String charsetName) throws MaltChainedException {
083 try {
084 if (is == System.in) {
085 closeStream = false;
086 }
087 open(new InputStreamReader(is, charsetName));
088 } catch (UnsupportedEncodingException e) {
089 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
090 }
091 }
092
093 private void open(InputStreamReader isr) throws MaltChainedException {
094 setReader(new BufferedReader(isr));
095 setSentenceCount(0);
096 }
097
098 public void readProlog() throws MaltChainedException {
099
100 }
101
102 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
103 if (syntaxGraph == null || dataFormatInstance == null) {
104 return false;
105 }
106
107 Element node = null;
108 Edge edge = null;
109 input.setLength(0);
110 int i = 0;
111 int terminalCounter = 0;
112 int nNewLines = 0;
113 syntaxGraph.clear();
114 syntaxGraph.getSymbolTables().cleanUp();
115 Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
116 while (true) {
117 int c;
118
119 try {
120 c = reader.read();
121 } catch (IOException e) {
122 close();
123 throw new DataFormatException("Error when reading from the input file. ", e);
124 }
125 if (c == TAB || c == NEWLINE || c == CARRIAGE_RETURN || c == -1) {
126 if (input.length() != 0) {
127 if (i == 0) {
128 terminalCounter++;
129 node = syntaxGraph.addTokenNode(terminalCounter);
130 }
131 if (columns.hasNext()) {
132 ColumnDescription column = columns.next();
133 if (column.getCategory() == ColumnDescription.INPUT && node != null) {
134 syntaxGraph.addLabel(node, column.getName(), input.toString());
135 } else if (column.getCategory() == ColumnDescription.HEAD) {
136 if (syntaxGraph instanceof DependencyStructure) {
137 if (column.getCategory() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) {
138 // if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix
139 //if (!input.toString().equals(IGNORE_COLUMN_SIGN)) {
140 edge = ((DependencyStructure)syntaxGraph).addDependencyEdge(Integer.parseInt(input.toString()), terminalCounter);
141 }
142 }
143 else {
144 close();
145 throw new DataFormatException("The input graph is not a dependency graph and therefore it is not possible to add dependncy edges. ");
146 }
147 } else if (column.getCategory() == ColumnDescription.DEPENDENCY_EDGE_LABEL && edge != null) {
148 //if (column.getType() != ColumnDescription.IGNORE && !input.toString().equals(IGNORE_COLUMN_SIGN)) { // bugfix not working for everybody
149 syntaxGraph.addLabel(edge, column.getName(), input.toString());
150 //} // bugfix
151 }
152 }
153 input.setLength(0);
154 nNewLines = 0;
155 i++;
156 } else if (c == TAB) {
157 throw new MaltChainedException("The input file '"+fileName+"' contains a column where the value is an empty string. Please check your input file. ");
158 }
159 if (c == NEWLINE) {
160 nNewLines++;
161 i = 0;
162 columns = dataFormatInstance.iterator();
163 }
164 } else {
165 input.append((char)c);
166 }
167
168 if (nNewLines == 2 && c == NEWLINE) {
169 if (syntaxGraph.hasTokens()) {
170 sentenceCount++;
171 }
172 return true;
173 } else if (c == -1) {
174 if (syntaxGraph.hasTokens()) {
175 sentenceCount++;
176 }
177 if (cIterations < nIterations) {
178 cIterations++;
179 reopen();
180 return true;
181 }
182
183 return false;
184 }
185 }
186 }
187
188 public void readEpilog() throws MaltChainedException {
189
190 }
191
192 public BufferedReader getReader() {
193 return reader;
194 }
195
196 public void setReader(BufferedReader reader) throws MaltChainedException {
197 close();
198 this.reader = reader;
199 }
200
201 public DataFormatInstance getDataFormatInstance() {
202 return dataFormatInstance;
203 }
204
205 public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
206 this.dataFormatInstance = dataFormatInstance;
207 }
208
209 public int getSentenceCount() throws MaltChainedException {
210 return sentenceCount;
211 }
212
213 public void setSentenceCount(int sentenceCount) {
214 this.sentenceCount = sentenceCount;
215 }
216
217 public String getOptions() {
218 return null;
219 }
220
221 public void setOptions(String optionString) throws MaltChainedException {
222
223 }
224
225 public String getFileName() {
226 return fileName;
227 }
228
229 public void setFileName(String fileName) {
230 this.fileName = fileName;
231 }
232
233 public URL getUrl() {
234 return url;
235 }
236
237 public void setUrl(URL url) {
238 this.url = url;
239 }
240
241 public String getCharsetName() {
242 return charsetName;
243 }
244
245 public void setCharsetName(String charsetName) {
246 this.charsetName = charsetName;
247 }
248
249 public int getNIterations() {
250 return nIterations;
251 }
252
253 public void setNIterations(int iterations) {
254 nIterations = iterations;
255 }
256
257 public int getIterationCounter() {
258 return cIterations;
259 }
260
261 public void close() throws MaltChainedException {
262 try {
263 if (reader != null) {
264 if (closeStream) {
265 reader.close();
266 }
267 reader = null;
268 }
269 } catch (IOException e) {
270 throw new DataFormatException("Error when closing the input file. ", e);
271 }
272 }
273
274 public void clear() throws MaltChainedException {
275 close();
276 input.setLength(0);
277 dataFormatInstance = null;
278 sentenceCount = 0;
279 }
280 }