COMPILER ENGINEERING
ASSIGNMENT: LEX PROGRAM
BY Yogasimman.R
IT
2022115125
Problem Statement: Lexical Analyzer for C-like Language
Objective:
Create a lexical analyzer (lexer) for a simplified C-like programming language. The lexer will read
source code and break it into its constituent tokens, classifying them based on their types. The
program should be able to identify keywords, operators, identifiers, literals (integer, float, string,
character), comments, punctuation marks, and handle invalid tokens.
Input:
The program takes as input a text file or standard input that contains source code written in a
simplified C-like language. The input can contain the following:
1. Keywords: int, float, double, char, void, if, else, for, while, return,
break, continue, struct, union, typedef, enum, switch, case, default,
const, static, extern.
2. Identifiers: Any valid variable or function names (e.g., main, sum, x, temp_var).
3. Operators: +, -, *, /, ++, --, ==, !=, <=, >=, &&, ||, &, |, ^, ~, <<, >>, =, +=, -=, *=,
/=.
4. Literals: Integer literals (e.g., 123, 0b1010, 0x1A), float literals (e.g., 3.14, 2.71e-3),
character literals (e.g., 'a'), and string literals (e.g., "hello").
5. Comments: Single-line comments starting with // and multi-line comments enclosed by
/* ... */.
6. Punctuation Marks: Parentheses (), braces {}, brackets [], semicolons ;, and commas
,.
Lex Code:
%{
#include <stdio.h>
#include <string.h>
#include <ctype.h>
int line_number = 1;
int keyword_count = 0;
int identifier_count = 0;
int operator_count = 0;
int compound_operator_count = 0;
int ternary_operator_count = 0;
int literal_count = 0;
int complex_literal_count = 0;
int comment_count = 0;
int punctuation_count = 0;
int nested_structure_count = 0;
int invalid_token_count = 0;
/* NULL-terminated table of the reserved words the lexer reports as keywords. */
char *keywords[] = {
    "int", "float", "double", "char", "void", "if", "else", "for", "while", "return",
    "break", "continue", "struct", "union", "typedef", "enum", "switch", "case",
    "default", "const", "static", "extern", NULL
};

/*
 * Report whether str is a reserved word.
 *
 * str: NUL-terminated text to look up (compared case-sensitively).
 * Returns 1 when str equals an entry of keywords, otherwise 0.
 */
int is_keyword(char *str) {
    char **entry = keywords;

    /* Walk the table until the NULL sentinel that terminates it. */
    while (*entry != NULL) {
        if (strcmp(str, *entry) == 0) {
            return 1;
        }
        entry++;
    }
    return 0;
}
/*
 * Write one token report line to standard output in the form
 * "<type> (<value>) at line <n>", where <n> is the current line_number.
 *
 * type:  category label for the token.
 * value: the token text.
 */
void print_token(char *type, char *value) {
    fprintf(stdout, "%s (%s) at line %d\n", type, value, line_number);
}
%}
%option noyywrap
%%
\n { line_number++; }
    /*
     * Token rules.
     *
     * flex treats everything between double quotes as literal text, so only
     * fixed operator/punctuation spellings are quoted (one spelling per quoted
     * string, joined by an unquoted |). Character classes, repetition and
     * grouping are left unquoted so they act as regular-expression syntax.
     * flex picks the longest match and, on a tie, the earliest rule.
     */
[ \t\r]+ { /* skip horizontal whitespace */ }

    /* Comments. */
"/*"([^*]|\*+[^*/])*\*+"/" {
    print_token("Multi-line Comment", yytext);
    comment_count++;
    /* Newlines inside the comment never reach the \n rule; count them here
     * so later tokens report the right line. */
    for (const char *p = yytext; *p != '\0'; ++p) {
        if (*p == '\n') {
            line_number++;
        }
    }
}
"//".* { print_token("Single-line Comment", yytext); comment_count++; }

    /* Must precede the identifier rule: both match these names with equal
     * length, and the earlier rule wins the tie. Longer names such as "sine"
     * still lex as identifiers because the longest match is preferred. */
sin|cos|log|sqrt { print_token("Function", yytext); identifier_count++; }

[a-zA-Z_][a-zA-Z0-9_]* {
    if (is_keyword(yytext)) {
        print_token("Keyword", yytext);
        keyword_count++;
    } else {
        print_token("Identifier", yytext);
        identifier_count++;
    }
}

    /* Operators. Multi-character spellings beat their single-character
     * prefixes through longest match, so rule order is not significant here. */
"++"|"--" {
    print_token("Increment/Decrement Operator", yytext);
    compound_operator_count++;
}
"=="|"!="|"<="|">="|"&&"|"||"|"<"|">"|"!" {
    print_token("Logical/Relational Operator", yytext);
    operator_count++;
}
"+="|"-="|"*="|"/=" {
    print_token("Compound Assignment Operator", yytext);
    compound_operator_count++;
}
"?"|":" { print_token("Ternary Operator", yytext); ternary_operator_count++; }
"&"|"|"|"^"|"~"|"<<"|">>" { print_token("Bitwise Operator", yytext); operator_count++; }
"=" { print_token("Assignment Operator", yytext); operator_count++; }
"+"|"-"|"*"|"/"|"%" { print_token("Arithmetic Operator", yytext); operator_count++; }

    /* Numeric literals. Prefixed and fractional forms are longer than the
     * plain-integer match on the same text, so they take priority. */
0b[01]+ { print_token("Binary Literal", yytext); complex_literal_count++; }
0x[0-9a-fA-F]+ { print_token("Hexadecimal Literal", yytext); complex_literal_count++; }
0[0-7]+ { print_token("Octal Literal", yytext); complex_literal_count++; }
[0-9]+\.[0-9]+([eE][-+]?[0-9]+)? {
    print_token("Float Literal", yytext);
    complex_literal_count++;
}
[0-9]+ { print_token("Integer Literal", yytext); literal_count++; }

    /* Text literals: a backslash escapes the following character; an
     * unescaped newline is not allowed inside the quotes. */
\"([^\\\"\n]|\\.)*\" { print_token("String Literal", yytext); literal_count++; }
'(\\.|[^\\'\n])' { print_token("Character Literal", yytext); literal_count++; }

    /* Punctuation. */
"{"|"}" { print_token("Brace", yytext); nested_structure_count++; }
"("|")" { print_token("Parenthesis", yytext); punctuation_count++; }
"["|"]" { print_token("Bracket", yytext); punctuation_count++; }
";" { print_token("Semicolon", yytext); punctuation_count++; }
"," { print_token("Comma", yytext); punctuation_count++; }

    /* Catch-all, kept last: any character no rule above accepts is reported
     * as invalid instead of being echoed by the default action. */
. { print_token("Invalid Token", yytext); invalid_token_count++; }
%%
/* Input stream of the flex-generated scanner (standard input by default).
 * The generated code already declares it; the redeclaration is compatible
 * and keeps this entry point readable on its own. */
extern FILE *yyin;

/*
 * Entry point: tokenize the input and print the per-category summary.
 *
 * With no arguments the scanner reads standard input. If a path is given as
 * the first argument, that file is scanned instead, matching the problem
 * statement's "text file or standard input".
 *
 * Returns 0 on success, 1 if the named input file cannot be opened.
 */
int main(int argc, char *argv[]) {
    FILE *input = NULL;

    if (argc > 1) {
        input = fopen(argv[1], "r");
        if (input == NULL) {
            perror(argv[1]);
            return 1;
        }
        yyin = input;
    }

    yylex();

    printf("\n--- Summary ---\n");
    printf("Keywords: %d\n", keyword_count);
    printf("Identifiers: %d\n", identifier_count);
    printf("Operators: %d\n", operator_count);
    printf("Compound Operators: %d\n", compound_operator_count);
    printf("Ternary Operators: %d\n", ternary_operator_count);
    printf("Literals: %d\n", literal_count);
    printf("Complex Literals: %d\n", complex_literal_count);
    printf("Comments: %d\n", comment_count);
    printf("Punctuation: %d\n", punctuation_count);
    printf("Nested Structures: %d\n", nested_structure_count);
    printf("Invalid Tokens: %d\n", invalid_token_count);

    /* Only close a stream this function opened; stdin is left alone. */
    if (input != NULL) {
        fclose(input);
    }
    return 0;
}
Test_Input.txt:
x = sin(45) + cos(30) * y / log(2) + z++;
result = a * b + c / d - e % f;
if (sqrt(25) >= 5) { x++; }
Output: