Skip to content

Package: HeaderTokenizer$Token

HeaderTokenizer$Token

name | instruction | branch | complexity | line | method
HeaderTokenizer.Token(int, String)
M: 0 C: 9
100%
M: 0 C: 0
100%
M: 0 C: 1
100%
M: 0 C: 4
100%
M: 0 C: 1
100%
getType()
M: 0 C: 3
100%
M: 0 C: 0
100%
M: 0 C: 1
100%
M: 0 C: 1
100%
M: 0 C: 1
100%
getValue()
M: 0 C: 3
100%
M: 0 C: 0
100%
M: 0 C: 1
100%
M: 0 C: 1
100%
M: 0 C: 1
100%

Coverage

1: /*
2: * Copyright (c) 1997, 2023 Oracle and/or its affiliates. All rights reserved.
3: *
4: * This program and the accompanying materials are made available under the
5: * terms of the Eclipse Public License v. 2.0, which is available at
6: * http://www.eclipse.org/legal/epl-2.0.
7: *
8: * This Source Code may also be made available under the following Secondary
9: * Licenses when the conditions for such availability set forth in the
10: * Eclipse Public License v. 2.0 are satisfied: GNU General Public License,
11: * version 2 with the GNU Classpath Exception, which is available at
12: * https://www.gnu.org/software/classpath/license.html.
13: *
14: * SPDX-License-Identifier: EPL-2.0 OR GPL-2.0 WITH Classpath-exception-2.0
15: */
16:
17: package jakarta.mail.internet;
18:
19: import java.util.*;
20:
21: /**
22: * This class tokenizes RFC822 and MIME headers into the basic
23: * symbols specified by RFC822 and MIME. <p>
24: *
25: * This class handles folded headers (i.e., headers with embedded
26: * CRLF SPACE sequences). The folds are removed in the returned
27: * tokens.
28: *
29: * @author John Mani
30: * @author Bill Shannon
31: */
32:
33: public class HeaderTokenizer {
34:
35: /**
36: * The Token class represents tokens returned by the
37: * HeaderTokenizer.
38: */
39: public static class Token {
40:
41:         private int type;
42:         private String value;
43:
44:         /**
45:          * Token type indicating an ATOM.
46:          */
47:         public static final int ATOM                 = -1;
48:
49:         /**
50:          * Token type indicating a quoted string. The value
51:          * field contains the string without the quotes.
52:          */
53:         public static final int QUOTEDSTRING         = -2;
54:
55:         /**
56:          * Token type indicating a comment. The value field
57:          * contains the comment string without the comment
58:          * start and end symbols.
59:          */
60:         public static final int COMMENT                = -3;
61:
62:         /**
63:          * Token type indicating end of input.
64:          */
65:         public static final int EOF                 = -4;
66:
67:         /**
68:          * Constructor.
69:          * @param        type        Token type
70:          * @param        value        Token value
71:          */
72:         public Token(int type, String value) {
73:          this.type = type;
74:          this.value = value;
75:         }
76:
77:         /**
78:          * Return the type of the token. If the token represents a
79:          * delimiter or a control character, the type is that character
80:          * itself, converted to an integer. Otherwise, it's value is
81:          * one of the following:
82:          * <ul>
83:          * <li><code>ATOM</code> A sequence of ASCII characters
84:          *        delimited by either SPACE, CTL, "(", <"> or the
85:          *        specified SPECIALS
86:          * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
87:          *        within quotes
88:          * <li><code>COMMENT</code> A sequence of ASCII characters
89:          *        within "(" and ")".
90:          * <li><code>EOF</code> End of header
91:          * </ul>
92:          *
93:          * @return        the token type
94:          */
95:         public int getType() {
96:          return type;
97:         }
98:
99:         /**
100:          * Returns the value of the token just read. When the current
101:          * token is a quoted string, this field contains the body of the
102:          * string, without the quotes. When the current token is a comment,
103:          * this field contains the body of the comment.
104:          *
105:          * @return        token value
106:          */
107:         public String getValue() {
108:          return value;
109:         }
110: }
111:
// Tokenizer state. 'string', 'skipComments', 'delimiters' and 'maxPos'
// are only assigned by the three-argument constructor; the three
// position fields advance as tokens are consumed or peeked.
private String string; // the string to be tokenized (the constructor stores "" for a null header)
private boolean skipComments; // should comments be skipped ?
private String delimiters; // delimiter string; characters in it end an ATOM
private int currentPos; // current parse position (scratch cursor used by getNext())
private int maxPos; // string length
private int nextPos; // track start of next Token for next()
private int peekPos; // track start of next Token for peek()
119:
120: /**
121: * RFC822 specials
122: */
123: public final static String RFC822 = "()<>@,;:\\\"\t .[]";
124:
125: /**
126: * MIME specials
127: */
128: public final static String MIME = "()<>@,;:\\\"\t []/?=";
129:
130: // The EOF Token
131: private final static Token EOFToken = new Token(Token.EOF, null);
132:
/**
 * Constructor that takes a rfc822 style header.
 *
 * @param header       The rfc822 header to be tokenized; a
 *                     <code>null</code> header is treated as an
 *                     empty string
 * @param delimiters   Set of delimiter characters to be used to
 *                     delimit ATOMS. These are usually
 *                     <code>RFC822</code> or <code>MIME</code>
 * @param skipComments If true, comments are skipped and not
 *                     returned as tokens
 */
public HeaderTokenizer(String header, String delimiters,
                       boolean skipComments) {
    // Tokenize a missing header as if it were empty.
    if (header == null) {
        this.string = "";
    } else {
        this.string = header;
    }
    this.delimiters = delimiters;
    this.skipComments = skipComments;
    this.maxPos = this.string.length();

    // All three cursors start at the beginning of the header.
    this.currentPos = 0;
    this.nextPos = 0;
    this.peekPos = 0;
}
152:
/**
 * Constructor. Comments are ignored and not returned as tokens.
 * Delegates to the three-argument constructor with
 * <code>skipComments</code> set to <code>true</code>.
 *
 * @param        header The header that is tokenized
 * @param        delimiters The delimiters to be used
 */
public HeaderTokenizer(String header, String delimiters) {
        this(header, delimiters, true);
}
162:
/**
 * Constructor. The RFC822 defined delimiters - RFC822 - are
 * used to delimit ATOMS. Also comments are skipped and not
 * returned as tokens. Delegates to the two-argument constructor
 * with <code>RFC822</code> as the delimiter string.
 *
 * @param        header        the header string
 */
public HeaderTokenizer(String header) {
        this(header, RFC822);
}
173:
/**
 * Parses the next token from this String. <p>
 *
 * Clients sit in a loop calling next() to parse successive
 * tokens until an EOF Token is returned. <p>
 *
 * Equivalent to <code>next('\0', false)</code>: no end-of-atom
 * character is used and backslash escapes are not preserved.
 *
 * @return                the next Token
 * @exception        ParseException if the parse fails
 */
public Token next() throws ParseException {
        return next('\0', false);
}
186:
/**
 * Parses the next token from this String.
 * If endOfAtom is not NUL, the token extends until the
 * endOfAtom character is seen, or to the end of the header.
 * This method is useful when parsing headers that don't
 * obey the MIME specification, e.g., by failing to quote
 * parameter values that contain spaces. <p>
 *
 * Equivalent to <code>next(endOfAtom, false)</code>: backslash
 * escapes are not preserved in the returned value.
 *
 * @param        endOfAtom        if not NUL, character marking end of token
 * @return                the next Token
 * @exception        ParseException if the parse fails
 * @since                JavaMail 1.5
 */
public Token next(char endOfAtom) throws ParseException {
        return next(endOfAtom, false);
}
203:
204: /**
205: * Parses the next token from this String.
206: * endOfAtom is handled as above. If keepEscapes is true,
207: * any backslash escapes are preserved in the returned string.
208: * This method is useful when parsing headers that don't
209: * obey the MIME specification, e.g., by failing to escape
210: * backslashes in the filename parameter.
211: *
212: * @param        endOfAtom        if not NUL, character marking end of token
213: * @param        keepEscapes        keep all backslashes in returned string?
214: * @return                the next Token
215: * @exception        ParseException if the parse fails
216: * @since                JavaMail 1.5
217: */
218: public Token next(char endOfAtom, boolean keepEscapes)
219:                                 throws ParseException {
220:         Token tk;
221:
222:         currentPos = nextPos; // setup currentPos
223:         tk = getNext(endOfAtom, keepEscapes);
224:         nextPos = peekPos = currentPos; // update currentPos and peekPos
225:         return tk;
226: }
227:
228: /**
229: * Peek at the next token, without actually removing the token
230: * from the parse stream. Invoking this method multiple times
231: * will return successive tokens, until <code>next()</code> is
232: * called.
233: *
234: * @return                the next Token
235: * @exception        ParseException if the parse fails
236: */
237: public Token peek() throws ParseException {
238:         Token tk;
239:
240:         currentPos = peekPos; // setup currentPos
241:         tk = getNext('\0', false);
242:         peekPos = currentPos; // update peekPos
243:         return tk;
244: }
245:
246: /**
247: * Return the rest of the Header.
248: *
249: * @return String        rest of header. null is returned if we are
250: *                        already at end of header
251: */
252: public String getRemainder() {
253:         if (nextPos >= string.length())
254:          return null;
255:         return string.substring(nextPos);
256: }
257:
258: /*
259: * Return the next token starting from 'currentPos'. After the
260: * parse, 'currentPos' is updated to point to the start of the
261: * next token.
262: */
263: private Token getNext(char endOfAtom, boolean keepEscapes)
264:                                 throws ParseException {
265:         // If we're already at end of string, return EOF
266:         if (currentPos >= maxPos)
267:          return EOFToken;
268:
269:         // Skip white-space, position currentPos beyond the space
270:         if (skipWhiteSpace() == Token.EOF)
271:          return EOFToken;
272:
273:         char c;
274:         int start;
275:         boolean filter = false;
276:         
277:         c = string.charAt(currentPos);
278:
279:         // Check or Skip comments and position currentPos
280:         // beyond the comment
281:         while (c == '(') {
282:          // Parsing comment ..
283:          int nesting;
284:          for (start = ++currentPos, nesting = 1;
285:                  nesting > 0 && currentPos < maxPos;
286:                  currentPos++) {
287:                 c = string.charAt(currentPos);
288:                 if (c == '\\') { // Escape sequence
289:                  currentPos++; // skip the escaped character
290:                  filter = true;
291:                 } else if (c == '\r')
292:                  filter = true;
293:                 else if (c == '(')
294:                  nesting++;
295:                 else if (c == ')')
296:                  nesting--;
297:          }
298:          if (nesting != 0)
299:                 throw new ParseException("Unbalanced comments");
300:
301:          if (!skipComments) {
302:                 // Return the comment, if we are asked to.
303:                 // Note that the comment start & end markers are ignored.
304:                 String s;
305:                 if (filter) // need to go thru the token again.
306:                  s = filterToken(string, start, currentPos-1, keepEscapes);
307:                 else
308:                  s = string.substring(start,currentPos-1);
309:
310:                 return new Token(Token.COMMENT, s);
311:          }
312:
313:          // Skip any whitespace after the comment.
314:          if (skipWhiteSpace() == Token.EOF)
315:                 return EOFToken;
316:          c = string.charAt(currentPos);
317:         }
318:
319:         // Check for quoted-string and position currentPos
320:         // beyond the terminating quote
321:         if (c == '"') {
322:          currentPos++;        // skip initial quote
323:          return collectString('"', keepEscapes);
324:         }
325:         
326:         // Check for SPECIAL or CTL
327:         if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
328:          if (endOfAtom > 0 && c != endOfAtom) {
329:                 // not expecting a special character here,
330:                 // pretend it's a quoted string
331:                 return collectString(endOfAtom, keepEscapes);
332:          }
333:          currentPos++; // re-position currentPos
334:          char ch[] = new char[1];
335:          ch[0] = c;
336:          return new Token((int)c, new String(ch));
337:         }
338:
339:         // Check for ATOM
340:         for (start = currentPos; currentPos < maxPos; currentPos++) {
341:          c = string.charAt(currentPos);
342:          // ATOM is delimited by either SPACE, CTL, "(", <">
343:          // or the specified SPECIALS
344:          if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
345:                         c == '"' || delimiters.indexOf(c) >= 0) {
346:                 if (endOfAtom > 0 && c != endOfAtom) {
347:                  // not the expected atom after all;
348:                  // back up and pretend it's a quoted string
349:                  currentPos = start;
350:                  return collectString(endOfAtom, keepEscapes);
351:                 }
352:                 break;
353:          }
354:         }
355:         return new Token(Token.ATOM, string.substring(start, currentPos));
356: }
357:
358: private Token collectString(char eos, boolean keepEscapes)
359:                                 throws ParseException {
360:         int start;
361:         boolean filter = false;
362:         for (start = currentPos; currentPos < maxPos; currentPos++) {
363:          char c = string.charAt(currentPos);
364:          if (c == '\\') { // Escape sequence
365:                 currentPos++;
366:                 filter = true;
367:          } else if (c == '\r')
368:                 filter = true;
369:          else if (c == eos) {
370:                 currentPos++;
371:                 String s;
372:
373:                 if (filter)
374:                  s = filterToken(string, start, currentPos-1, keepEscapes);
375:                 else
376:                  s = string.substring(start, currentPos-1);
377:
378:                 if (c != '"') {                // not a real quoted string
379:                  s = trimWhiteSpace(s);
380:                  currentPos--;        // back up before the eos char
381:                 }
382:
383:                 return new Token(Token.QUOTEDSTRING, s);
384:          }
385:         }
386:
387:         // ran off the end of the string
388:
389:         // if we're looking for a matching quote, that's an error
390:         if (eos == '"')
391:          throw new ParseException("Unbalanced quoted string");
392:
393:         // otherwise, just return whatever's left
394:         String s;
395:         if (filter)
396:          s = filterToken(string, start, currentPos, keepEscapes);
397:         else
398:          s = string.substring(start, currentPos);
399:         s = trimWhiteSpace(s);
400:         return new Token(Token.QUOTEDSTRING, s);
401: }
402:
403: // Skip SPACE, HT, CR and NL
404: private int skipWhiteSpace() {
405:         char c;
406:         for (; currentPos < maxPos; currentPos++)
407:          if (((c = string.charAt(currentPos)) != ' ') &&
408:                 (c != '\t') && (c != '\r') && (c != '\n'))
409:                 return currentPos;
410:         return Token.EOF;
411: }
412:
413: // Trim SPACE, HT, CR and NL from end of string
414: private static String trimWhiteSpace(String s) {
415:         char c;
416:         int i;
417:         for (i = s.length() - 1; i >= 0; i--) {
418:          if (((c = s.charAt(i)) != ' ') &&
419:                 (c != '\t') && (c != '\r') && (c != '\n'))
420:                 break;
421:         }
422:         if (i <= 0)
423:          return "";
424:         else
425:          return s.substring(0, i + 1);
426: }
427:
428: /* Process escape sequences and embedded LWSPs from a comment or
429: * quoted string.
430: */
431: private static String filterToken(String s, int start, int end,
432:                                 boolean keepEscapes) {
433:         StringBuilder sb = new StringBuilder();
434:         char c;
435:         boolean gotEscape = false;
436:         boolean gotCR = false;
437:
438:         for (int i = start; i < end; i++) {
439:          c = s.charAt(i);
440:          if (c == '\n' && gotCR) {
441:                 // This LF is part of an unescaped
442:                 // CRLF sequence (i.e, LWSP). Skip it.
443:                 gotCR = false;
444:                 continue;
445:          }
446:
447:          gotCR = false;
448:          if (!gotEscape) {
449:                 // Previous character was NOT '\'
450:                 if (c == '\\') // skip this character
451:                  gotEscape = true;
452:                 else if (c == '\r') // skip this character
453:                  gotCR = true;
454:                 else // append this character
455:                  sb.append(c);
456:          } else {
457:                 // Previous character was '\'. So no need to
458:                 // bother with any special processing, just
459:                 // append this character. If keepEscapes is
460:                 // set, keep the backslash. IE6 fails to escape
461:                 // backslashes in quoted strings in HTTP headers,
462:                 // e.g., in the filename parameter.
463:                 if (keepEscapes)
464:                  sb.append('\\');
465:                 sb.append(c);
466:                 gotEscape = false;
467:          }
468:         }
469:         return sb.toString();
470: }
471: }