CS 3308 Programming Assignment Unit 4

The document outlines a Python program for processing text documents to create an inverted index and calculate term frequencies, including TF-IDF values. It defines various functions for text normalization, tokenization, stop word removal, and database management using SQLite. The program also includes a main execution function that sets up the database, processes a corpus of documents, and reports statistics on the processed data.
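A note on the weighting before the listing: for every (term, document) pair the program stores a tf-idf value computed as the raw term frequency in the document multiplied by the natural logarithm of (total documents / documents containing the term). The short sketch below is not part of the assignment listing and uses made-up numbers purely to illustrate the arithmetic:

import math

# Hypothetical values chosen only to illustrate the weighting formula
total_docs = 100        # corpus size
doc_freq = 4            # documents containing the term
term_freq_in_doc = 3    # occurrences of the term in one document

# Same formula the indexer uses when filling the Posting table:
# tf-idf = term frequency * ln(total documents / document frequency)
tfidf = term_freq_in_doc * math.log(total_docs / doc_freq)
print(round(tfidf, 3))  # 3 * ln(25) ≈ 9.657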


import string
import sys
import os
import re
import math
import sqlite3
import time
from typing import Dict, Set, List
from collections import defaultdict, Counter
from nltk.stem import PorterStemmer

# Define stop words
stop_words = set(["a", "an", "the", "and", "or", "but", "is", "are", "was",
                  "were", "in", "of", "to", "with"])

# Compile regex patterns for efficiency
chars = re.compile(r'\W+')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# Global counters for corpus statistics
tokens = 0
documents = 0
terms = 0
stop_word_count = 0

# Database to store term information
database: Dict[str, 'Term'] = {}

class Term:
    """
    Class to represent term information in the index
    Stores term frequency, document frequency, and posting information
    """

    def __init__(self):
        self.termid: int = 0
        self.termfreq: int = 0
        self.docs: int = 0
        self.docids: Dict[int, int] = {}

def splitchars(line: str) -> List[str]:
    """
    Split input text into tokens based on non-word characters
    Args:
        line: Input text string
    Returns:
        List of tokens
    """
    return chars.split(line)

def remove_stop_words(tokens: List[str]) -> List[str]:
    """Remove stop words from the token list and count how many were removed."""
    global stop_word_count
    filtered_tokens = [token for token in tokens if token not in stop_words]
    stop_word_count += len(tokens) - len(filtered_tokens)
    return filtered_tokens

ps = PorterStemmer()

def stem_tokens(tokens: List[str]) -> List[str]:
    """Apply Porter stemming to each token."""
    return [ps.stem(token) for token in tokens]

def remove_punctuation_tokens(tokens: List[str]) -> List[str]:
    """Drop tokens that are empty or begin with a punctuation character."""
    return [token for token in tokens
            if token and token[0] not in string.punctuation]

def parsetoken(line: str) -> List[str]:
    """
    Process a line of text to extract and index terms
    Args:
        line: Input text line
    Returns:
        List of processed tokens
    """
    global documents, tokens, terms

    # Normalize input text
    line = line.replace('\t', ' ').strip()

    # Split into tokens, then remove stop words, stem, and strip punctuation
    token_list = splitchars(line)
    token_list = remove_stop_words(token_list)
    token_list = stem_tokens(token_list)
    token_list = remove_punctuation_tokens(token_list)

    for token in token_list:
        # Clean and normalize token
        token = token.replace('\n', '')
        lower_token = token.lower().strip()

        if not lower_token:  # Skip empty tokens
            continue

        tokens += 1  # Increment total token count

        # Add new term to database if it does not exist
        if lower_token not in database:
            terms += 1
            database[lower_token] = Term()
            database[lower_token].termid = terms
            database[lower_token].docids = {}
            database[lower_token].docs = 0

        # Update posting information
        if documents not in database[lower_token].docids:
            database[lower_token].docs += 1
            database[lower_token].docids[documents] = 0

        # Update term frequency
        database[lower_token].docids[documents] += 1
        database[lower_token].termfreq += 1

    return token_list

def process(filename: str) -> bool:
    """
    Process a single document file
    Args:
        filename: Path to document file
    Returns:
        Boolean indicating success
    """
    try:
        # print(f"Reading file: {filename}")
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                parsetoken(line)
        return True
    except IOError as e:
        print(f"Error processing file {filename}: {str(e)}")
        return False
    except UnicodeDecodeError:
        print(f"Unicode decode error in file {filename}")
        return False

def walkdir(cur: sqlite3.Cursor, dirname: str) -> bool:
    """
    Recursively walk through directory and process all files
    Args:
        cur: Database cursor
        dirname: Directory path
    Returns:
        Boolean indicating success
    """
    global documents

    try:
        # Get all files and directories
        all_items = [f for f in os.listdir(dirname)
                     if os.path.isdir(os.path.join(dirname, f))
                     or os.path.isfile(os.path.join(dirname, f))]

        for item in all_items:
            full_path = os.path.join(dirname, item)
            if os.path.isdir(full_path):
                print(f"Entering directory: {full_path}")
                walkdir(cur, full_path)
            else:
                # print(f"Processing file: {full_path}")
                documents += 1
                # Add document to dictionary
                cur.execute("INSERT INTO DocumentDictionary VALUES (?, ?)",
                            (full_path, documents))
                if not process(full_path):
                    print(f"Failed to process file: {full_path}")
        return True

    except Exception as e:
        print(f"Error walking directory {dirname}: {str(e)}")
        return False

def setup_database(cursor: sqlite3.Cursor):
    """
    Set up database tables and indexes
    Args:
        cursor: Database cursor
    """
    # Document Dictionary
    cursor.execute("DROP TABLE IF EXISTS DocumentDictionary")

    # Term Dictionary
    cursor.execute("DROP TABLE IF EXISTS TermDictionary")

    # Posting Table
    cursor.execute("DROP TABLE IF EXISTS Posting")

    # Create new tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS DocumentDictionary (
            DocumentName TEXT,
            DocId INTEGER PRIMARY KEY
        )
    """)

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS TermDictionary (
            Term TEXT,
            TermId INTEGER PRIMARY KEY
        )
    """)

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS Posting (
            TermId INTEGER,
            DocId INTEGER,
            tfidf REAL,
            docfreq INTEGER,
            termfreq INTEGER,
            FOREIGN KEY(TermId) REFERENCES TermDictionary(TermId),
            FOREIGN KEY(DocId) REFERENCES DocumentDictionary(DocId)
        )
    """)

    # Create indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_term ON TermDictionary(Term)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_term ON Posting(TermId)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_doc ON Posting(DocId)")

def calculate_frequencies():
    """Collect per-term raw frequencies by document and document frequencies."""
    term_frequencies = {}
    document_frequencies = defaultdict(int)

    for term, term_obj in database.items():
        # Counter mapping doc_id -> raw frequency of this term in that document
        tf = Counter(term_obj.docids)
        term_frequencies[term] = tf
        document_frequencies[term] = len(tf)

    return term_frequencies, document_frequencies

def calculate_idf(document_frequencies, total_docs):
    """Compute the inverse document frequency for every term."""
    idf = {}
    for term, df in document_frequencies.items():
        idf[term] = math.log(total_docs / df)
    return idf

def calculate_tf_idf(term_frequencies, idf):
    """Compute tf-idf weights as a mapping of term -> {doc_id: weight}."""
    tf_idf = {}

    for term, tf in term_frequencies.items():
        tf_idf[term] = {doc_id: freq * idf.get(term, 0)
                        for doc_id, freq in tf.items()}

    return tf_idf

class InvertedIndex:
    """Simple in-memory inverted index mapping terms to (doc_id, weight) postings."""

    def __init__(self):
        self.index = defaultdict(list)

    def add_document(self, doc_id, tf_idf):
        """Add one document's {term: tf-idf weight} mapping to the index."""
        for term, weight in tf_idf.items():
            self.index[term].append((doc_id, weight))

def report_statistics():
    """Print summary statistics for the indexed corpus."""
    total_terms = sum(term.termfreq for term in database.values())
    unique_terms = len(database)
    print(f"Number of documents processed: {documents}")
    print(f"Total number of terms parsed from all documents: {tokens}")
    print(f"Total number of unique terms found and added to the index: {unique_terms}")
    print(f"Total number of terms found that matched one of the stop words: {stop_word_count}")

def main():
    """
    Main execution function
    """
    # Record start time
    start_time = time.localtime()
    print(f"Start Time: {start_time.tm_hour:02d}:{start_time.tm_min:02d}")

    # Initialize database
    db_path = "cacm_index.db"
    conn = sqlite3.connect(db_path)
    conn.isolation_level = None  # Enable autocommit
    cursor = conn.cursor()

    # Setup database tables
    setup_database(cursor)

    # Process corpus
    corpus_path = "./cacm"  # Update this path to match your environment
    if not os.path.exists(corpus_path):
        print(f"Error: Corpus directory not found at {corpus_path}")
        return

    walkdir(cursor, corpus_path)

    # Calculate tf-idf for each term in each document
    term_frequencies, document_frequencies = calculate_frequencies()
    idf = calculate_idf(document_frequencies, documents)
    tf_idf = calculate_tf_idf(term_frequencies, idf)

    # Insert terms into database
    for term, term_obj in database.items():
        cursor.execute("INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)",
                       (term, term_obj.termid))

        # Insert posting information
        for doc_id, freq in term_obj.docids.items():
            tfidf = freq * math.log(documents / term_obj.docs)
            cursor.execute("""
                INSERT INTO Posting
                    (TermId, DocId, tfidf, docfreq, termfreq)
                VALUES (?, ?, ?, ?, ?)
            """, (term_obj.termid, doc_id, tfidf, term_obj.docs, freq))

    # Commit changes and close connection
    conn.commit()
    conn.close()

    # Print statistics
    report_statistics()

    end_time = time.localtime()
    print(f"\nEnd Time: {end_time.tm_hour:02d}:{end_time.tm_min:02d}")

if __name__ == '__main__':
    main()
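For readers who want to inspect the result, the sketch below is not part of the assignment listing; it shows one way to query the index that main() writes to cacm_index.db. Terms are stored stemmed and lower-cased, so a real query term would first need the same Porter stemming applied; the word "algorithm" is only an example.

import sqlite3

# Minimal sketch: list the ten highest-weighted documents for one term.
# Assumes the indexer above has already been run and cacm_index.db exists.
conn = sqlite3.connect("cacm_index.db")
cur = conn.cursor()

cur.execute("""
    SELECT d.DocumentName, p.tfidf
    FROM TermDictionary t
    JOIN Posting p ON p.TermId = t.TermId
    JOIN DocumentDictionary d ON d.DocId = p.DocId
    WHERE t.Term = ?
    ORDER BY p.tfidf DESC
    LIMIT 10
""", ("algorithm",))

for doc_name, weight in cur.fetchall():
    print(f"{weight:8.3f}  {doc_name}")

conn.close()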
