#!/usr/bin/python # tokenizer.py - Functions for tokenizing the input stream. import string, sys import token sys.setrecursionlimit(200000) SPECIAL_CHARACTERS = "()<>[]{}/%" BLOCK_START = "(<[{" WHITE_SPACE = "\x00\t\x0a\x0c\x0d " NEWLINE = "\x0a\x0c\x0d" HEX = "0123456789abcdefABCDEF" ZERO_PAD = "0000000000000000000000000000000000000000000000000000000000000000" SHORT_ZERO_PAD = "00000000000000000000000000000000" class Tokenizer: def __init__(self, instream): self.instream = instream self.mode = "name" self.last_char = "" self.last_token = None self.depth = 0 def __iter__(self): return self def _nextChar(self): self.last_char = self.instream.read(1) return self.last_char def _nextLine(self): line = self._nextChar() while (not (self.last_char in NEWLINE)): line += self._nextChar() return line def nextType1(self): self.mode = "name" current_token = token.Token(data_type="type1") count = 0 short_count = 0 while ((count < 8) and (short_count < 16)): line = self._nextLine() if (line == ""): raise StopIteration current_token.append(line) if (ZERO_PAD in line): count += 1 elif (SHORT_ZERO_PAD in line): short_count += 1 elif (count): count = 0 short_count = 0 self.last_char = line[-1] return current_token def next(self): if (self.mode == "type1"): return self.nextType1() if (self.mode == "EOF"): raise StopIteration if ((self.last_token != None) and (self.last_token.data_type == "operator") and (self.mode != "operator")): if (self.last_token.name == "("): self.mode = "string" elif (self.last_token.name == "<"): self.mode = "hex" elif (self.last_token.name == "<~"): self.mode = "base85" else: self.mode = "name" current_token = token.Token(data_type=self.mode) if (self.mode == "operator"): current_token.append(self.last_char) character = self._nextChar() if (self.last_char == ""): character = self._nextChar() else: character = self.last_char self.mode = "name" while (character != ""): if (not current_token.isValid(character)): if (current_token.data_type == "base85"): self.mode = "operator" if ((len(current_token.name) > 0) or ((current_token.data_type == "string") or (current_token.data_type == "hex") or (current_token.data_type == "base85")) or ((self.last_token != None) and (self.last_token.data_type == "operator") and ((self.last_token.name == "/") or (self.last_token.name == "//")))): self.last_token = current_token return current_token if character == "%": current_token.data_type = "comment" while ((character != "") and (character not in NEWLINE)): current_token.append(character) character = self._nextChar() return current_token if (character in SPECIAL_CHARACTERS): self.mode = "operator" break character = self._nextChar() break else: current_token.append(character) character = self._nextChar() if (character == ""): self.mode = "EOF" if (len(current_token.name) > 0): self.last_token = current_token return current_token return self.next() def __next__(self): return self.next()