158 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			158 lines
		
	
	
		
			3.9 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
#include <exception>
 | 
						|
#include <iostream>
 | 
						|
#include <lex.hpp>
 | 
						|
#include <string>
 | 
						|
#include <cctype>
 | 
						|
 | 
						|
using namespace std;
 | 
						|
 | 
						|
std::ostream &operator<<(std::ostream &os, Token const &t) { 
 | 
						|
    os << "Token(";
 | 
						|
    switch (t.type) {
 | 
						|
    case TokenType::OpenParen: os << "OpenParen)"; break;
 | 
						|
    case TokenType::CloseParen: os << "CloseParen)"; break;
 | 
						|
    case TokenType::Dollar: os << "Dollar)"; break;
 | 
						|
    case TokenType::Symbol: os << "Symbol, " << get<string>(t.value) << ")"; break;
 | 
						|
    case TokenType::String: os << "String, \"" << get<string>(t.value) << "\")"; break;
 | 
						|
    case TokenType::Int: os << "Int, " << get<int64_t>(t.value) << ")"; break;
 | 
						|
    case TokenType::Double: os << "Double, " << get<double>(t.value) << ")"; break;
 | 
						|
    case TokenType::End: os << "END)"; break;
 | 
						|
    default:
 | 
						|
        os << ")";
 | 
						|
    }
 | 
						|
    return os;
 | 
						|
}
 | 
						|
 | 
						|
/// Reports whether `c` is one of the ASCII punctuation characters
/// ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``.
///
/// The check uses a fixed table rather than the C library classification.
/// The NUL character is not punctuation and yields false.
bool ispunct(char c) {
    static const char punctuation[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

    // Walk up to, but not including, the terminating '\0'.  Iterating the
    // whole array would also compare against the terminator and wrongly
    // classify '\0' as punctuation.
    for (const char *p = punctuation; *p != '\0'; ++p) {
        if (*p == c)
            return true;
    }
    return false;
}
 | 
						|
 | 
						|
/// Reports whether `c` may appear inside a symbol or number token: any
/// character with a graphical representation except the two parentheses.
bool isSymbolChar(char c) {
    if (c == '(' || c == ')')
        return false;

    // The <cctype> classifiers require a value representable as
    // unsigned char (or EOF); passing a negative plain char is undefined
    // behaviour, so convert before the call.
    return std::isgraph(static_cast<unsigned char>(c)) != 0;
}
 | 
						|
 | 
						|
/// Creates a lexer whose input initially consists of `s`.
///
/// The text is inserted with operator<< rather than passed to the stream's
/// constructor.  NOTE(review): this assumes `ss` is a std::stringstream (its
/// declaration is not in this file).  A stringstream constructed directly
/// from a string keeps its write position at offset 0, so a later `ss << ...`
/// would overwrite the initial text instead of appending to it.
Lexer::Lexer(string s) : ss("") {
    ss << s;
}

/// Creates a lexer with no input yet.
Lexer::Lexer() : ss("") {}
 | 
						|
 | 
						|
/// Appends `s` to the lexer's pending input.
///
/// The stream's error state is reset first: reading past the end of the
/// input leaves eofbit/failbit set on `ss`, and while those bits are set an
/// insertion is a no-op, so the new text would otherwise be silently dropped.
void Lexer::feed(string s) {
    ss.clear();
    ss << s;
}
 | 
						|
 | 
						|
/// Lexes one maximal run of symbol characters from `ss` and classifies it.
///
/// The run becomes
///   - an Int    when it is an optional leading '-' followed only by digits,
///   - a Double  when it additionally contains exactly one '.',
///   - a Symbol  otherwise.
/// At least one digit is required for a number, so "-", "." and "-." are
/// Symbols.  The character that ends the run is left in the stream.
///
/// Conversion is done by std::stoll / std::stod, which throw
/// std::out_of_range when the literal does not fit the target type.
Token Lexer::lexNumOrSym() {
    // first we take the part that is either a number or symbol, then
    // we'll determine which it is.
    string s;
    while (true) {
        char c = ss.get();
        if (ss.eof())
            break;
        if (!isSymbolChar(c)) {
            ss.unget(); // not ours: put the delimiter back for the caller
            break;
        }

        s.push_back(c);
    }

    // TODO:  bigint. also reader base.
    // ... this will almost certainly change, won't it?

    // A leading '-' is a sign, not a digit, so the scan starts after it.
    // at() is kept so that an empty run still fails with std::out_of_range.
    const string::size_type first_digit = (s.at(0) == '-') ? 1 : 0;

    bool is_number = true;
    bool dot_seen = false;
    bool digit_seen = false;
    for (string::size_type i = first_digit; i < s.size(); ++i) {
        const char c = s[i];
        if (c == '.') {
            if (dot_seen) {
                // a second '.' can never be part of a number
                is_number = false;
                break;
            }
            dot_seen = true;
        } else if (isdigit(static_cast<unsigned char>(c))) {
            // cast: <cctype> classifiers are undefined for negative values
            digit_seen = true;
        } else {
            is_number = false;
            break;
        }
    }

    // Without any digit ("-", ".", "-.") there is nothing to convert.
    if (!is_number || !digit_seen)
        return {TokenType::Symbol, s};

    if (dot_seen)
        return {TokenType::Double, stod(s)};
    return {TokenType::Int, stoll(s)};
}
 | 
						|
 | 
						|
/// Lexes a double-quoted string literal.
///
/// Expects the opening quote to be the next character in `ss`.  Everything up
/// to the next '"' is taken verbatim as the string's contents; the closing
/// quote is consumed.  If the input ends before a closing quote is found, a
/// message is written to stderr and std::exception is thrown.
Token Lexer::lexString() {
    ss.get(); // discard the opening quote

    std::string contents;
    for (;;) {
        const char ch = ss.get();

        // eof() only becomes true after a read has failed, so test it
        // right after the get().
        if (ss.eof()) {
            std::cerr << "EOF while reading string.";
            throw std::exception();
        }

        if (ch == '"')
            return {TokenType::String, contents};

        contents.push_back(ch);
    }
}
 | 
						|
 | 
						|
/// Lexes a token that is not one of the single-character specials.
///
/// Peeks at the next character without consuming it and dispatches:
/// a '"' starts a string literal, any other symbol character starts a
/// number or symbol.  Anything else is reported on stderr and raised as
/// std::exception.
///
/// This function will not be called unless a character was received.
Token Lexer::lexNonSpecial() {
    const char lookahead = ss.peek();

    if (lookahead == '"')
        return lexString();

    if (!isSymbolChar(lookahead)) {
        std::cerr << "Non-printable character found." << std::endl;
        throw std::exception();
    }

    return lexNumOrSym();
}
 | 
						|
 | 
						|
/// Returns the next token from the input, skipping leading whitespace.
///
/// Yields an End token once the input is exhausted.  '(', ')' and '$' are
/// returned as their own token types; any other character is pushed back and
/// handed to lexNonSpecial().
Token Lexer::next() {
    while (true) {
        // we MUST check for eof AFTER trying to get a character.
        // ss.eof() doesn't return true until we try to get another
        // character while at EOF, even if we have exhausted the stream.
        char c = ss.get();
        if (ss.eof())
            return {TokenType::End};

        // The <cctype> classifiers are undefined for negative arguments,
        // which a plain char holding a non-ASCII byte can be; convert to
        // unsigned char before asking.
        if (std::isspace(static_cast<unsigned char>(c)))
            continue;

        switch (c) {
        case '(': return {TokenType::OpenParen};
        case ')': return {TokenType::CloseParen};
        case '$': return {TokenType::Dollar};
        default:
            // Not a special: let lexNonSpecial() see the character again.
            ss.unget();
            return lexNonSpecial();
        }
    }
}
 | 
						|
 | 
						|
/// Drains the lexer: calls next() repeatedly and returns every token produced
/// before the End token, in order.  The End token itself is not included.
vector<Token> Lexer::collect() {
    vector<Token> tokens;
    for (;;) {
        Token tok = next();
        if (tok.type == TokenType::End)
            return tokens;

        tokens.push_back(tok);
    }
}
 | 
						|
 | 
						|
/// Convenience wrapper: lexes all of `s` with a temporary Lexer and returns
/// the resulting tokens (without the terminating End token).
std::vector<Token> lex(std::string s) {
    return Lexer(s).collect();
}
 |