4학년 1학기 전공/컴파일러
[컴파일러] Assignment 1. Lexical Analysis
시데브
2025. 4. 12. 14:48
경희대학교 허선영 교수님의 컴파일러 수업을 기반으로 정리한 글입니다.
Goals of this Assignment
- Build a lexical analyzer for a new language, using a lexer generator for OCaml(ocamllex)
- know how to write a regular expression to represent a given set of strings
- Understand how to write and debug ocamllex specification
ocamllex
- ocamllex: lexer generator for OCaml
- ocamllex specification consists of three parts as shown in the example in Snippet 1.
- User Declarations: auxiliary(보조자) values & types 작성
- Line 0 to 5
- Lex Definitions: regular expression 정의
- Line 7
- Format: let ident = regexp ...
- Rules: rules for lexical anlysis 포함
- Line 9 to 11
- Format: rule entrypoint [arg1 ... argn] = parse regexp { action } | ...
- User Declarations: auxiliary(보조자) values & types 작성
- ocamllex provides predefined special operations for lexical analysis
- Lexing.lexeme lexbuf: return the matched string
- Lexing.lexeme_char lexbuf n: return the n-th character in the matched string
- Lexing.lexeme_start lexbuf: return the start position of the matched string
- Lexing.lexeme_end lexbuf: return the end position of the matched string
- in addition, ocammlex allowss to define multiple entry points for rules
- can switch to another entry point by calling entrypoint lexbuf.
The Fun Language
- Fun: new programming language introducedd in this course
- goal of the assignments: to build a compiler for the Fun language
- Specification of the language
- Reserved words: fun, in, let, while, do, if, then, else, ref, not
- Identifiers
- An identifier is a sequence of letters, digits, and underscores, starting with a letter
- Upper-case letters are distinguished from lower-case letters
- Numbers
- Each number is a sequence of digits
- minus sign should be lexed as a separate token
- Other Tokens
- Comments: start with /* and end with */
- Whitespaces: Newlines(\n), carriage-returns(\r), tabs(\t)
Lexical Analyzer for Fun
(* CSE322 Compiler Assignment 1 *)
{
exception Eof
(* create a tuple of the start and end positions *)
let make_pos buf = (Lexing.lexeme_start buf, Lexing.lexeme_end buf)
}
let alpha = ['A'-'Z''a'-'z']
let digit = ['0'-'9']
let projnum = '0' | ['1'-'9'] digit*
let ident = alpha ( alpha | digit | '_')*
rule initial = parse
| "fun" { Tokens.tok_fun(make_pos lexbuf) }
| "->" { Tokens.tok_arrow(make_pos lexbuf) }
| "in" { Tokens.tok_in(make_pos lexbuf) }
| "let" { Tokens.tok_let(make_pos lexbuf) }
| "else" { Tokens.tok_else(make_pos lexbuf) }
| "then" { Tokens.tok_then(make_pos lexbuf) }
| "if" { Tokens.tok_if(make_pos lexbuf) }
| "assign" { Tokens.tok_assign(make_pos lexbuf) }
| "!" { Tokens.tok_bang(make_pos lexbuf) }
| "ref" { Tokens.tok_ref(make_pos lexbuf) }
| "do" { Tokens.tok_do(make_pos lexbuf) }
| "while" { Tokens.tok_while(make_pos lexbuf) }
| "||" { Tokens.tok_or(make_pos lexbuf) }
| "not" { Tokens.tok_not(make_pos lexbuf) }
| "&" { Tokens.tok_and(make_pos lexbuf) }
| ">" { Tokens.tok_gt(make_pos lexbuf) }
| "=" { Tokens.tok_eq(make_pos lexbuf) }
| "<" { Tokens.tok_lt(make_pos lexbuf) }
| "*" { Tokens.tok_times(make_pos lexbuf) }
| "-" { Tokens.tok_minus(make_pos lexbuf) }
| "+" { Tokens.tok_plus(make_pos lexbuf) }
| ")" { Tokens.tok_rparen(make_pos lexbuf) }
| "(" { Tokens.tok_lparen(make_pos lexbuf) }
| ":" { Tokens.tok_colon(make_pos lexbuf) }
| ";" { Tokens.tok_semicolon(make_pos lexbuf) }
| "," { Tokens.tok_comma(make_pos lexbuf) }
| "#" projnum as s { Tokens.tok_proj(make_pos lexbuf, int_of_string (String.sub s 1 (String.length s - 1))) }
| ident as id { Tokens.tok_id(make_pos lexbuf, id) }
| digit+ as num { Tokens.tok_num(make_pos lexbuf, int_of_string num) } (* num 처리 *)
| [' ' '\t' '\r'] { initial lexbuf }
| "\n" { Errormsg.new_line(make_pos lexbuf); initial lexbuf }
| "/*" { comment 1 lexbuf }
| eof { raise Eof }
| _ {
let msg = Printf.sprintf "Unexpected character: %s" (Lexing.lexeme lexbuf) in
Errormsg.impossible msg
}
and comment depth = parse
| "/*" { comment (depth + 1) lexbuf }
| "*/" { if depth = 1 then initial lexbuf else comment (depth - 1) lexbuf }
| eof { Errormsg.impossible "Unclosed comment" }
| _ { comment depth lexbuf }
728x90
반응형