CS 3308 Programming Assignment Unit 4

The document outlines a Python program for processing text documents to create an inverted index and calculate term frequencies, including TF-IDF values. It defines various functions for text normalization, tokenization, stop word removal, and database management using SQLite. The program also includes a main execution function that sets up the database, processes a corpus of documents, and reports statistics on the processed data.
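For reference, the weight this program stores in the Posting table is the standard tf-idf product used throughout the code: for a term t in document d, tf-idf(t, d) = tf(t, d) * ln(N / df(t)), where tf(t, d) is the number of occurrences of t in d, N is the total number of documents processed, and df(t) is the number of documents containing t. As an illustrative example with assumed numbers (not taken from the CACM corpus): a term occurring 4 times in one document of a 100-document collection, and appearing in 10 documents overall, receives a weight of 4 * ln(100/10) ≈ 9.21.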


import string

import sys
import os
import re
import math
import sqlite3
import time
from typing import Dict, Set, List
from collections import defaultdict, Counter
from nltk.stem import PorterStemmer

# Define stop words
stop_words = set(["a", "an", "the", "and", "or", "but", "is", "are", "was",
                  "were", "in", "of", "to", "with"])

# Compile regex patterns for efficiency
chars = re.compile(r'\W+')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')

# Global counters for corpus statistics
tokens = 0
documents = 0
terms = 0
stop_word_count = 0

# Database to store term information
database: Dict[str, 'Term'] = {}

class Term:
    """
    Class to represent term information in the index
    Stores term frequency, document frequency, and posting information
    """

    def __init__(self):
        self.termid: int = 0
        self.termfreq: int = 0
        self.docs: int = 0
        self.docids: Dict[int, int] = {}

def splitchars(line: str) -> List[str]:
    """
    Split input text into tokens based on non-word characters
    Args:
        line: Input text string
    Returns:
        List of tokens
    """
    return chars.split(line)
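
# Illustrative example with an assumed input line (not taken from the corpus):
#     splitchars("Information retrieval, 1968.") -> ['Information', 'retrieval', '1968', '']
# Splitting on \W+ can leave an empty string when a line ends with punctuation;
# such empty tokens are dropped later by remove_punctuation_tokens and the
# empty-token check in parsetoken.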

def remove_stop_words(tokens: List[str]) -> List[str]:
    """Remove stop words from the token list and update the global stop word counter."""
    global stop_word_count
    filtered_tokens = [token for token in tokens if token not in stop_words]
    stop_word_count += len(tokens) - len(filtered_tokens)
    return filtered_tokens

ps = PorterStemmer()

def stem_tokens(tokens: List[str]) -> List[str]:
    """Reduce each token to its stem using the Porter stemmer."""
    return [ps.stem(token) for token in tokens]
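
# Illustrative example with assumed inputs:
#     stem_tokens(["algorithms", "processing", "index"]) -> ["algorithm", "process", "index"]
# using NLTK's Porter stemmer.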

def remove_punctuation_tokens(tokens: List[str]) -> List[str]:
    """Drop tokens that are empty or begin with a punctuation character."""
    return [token for token in tokens if token and token[0] not in
            string.punctuation]

def parsetoken(line: str) -> List[str]:
    """
    Process a line of text to extract and index terms
    Args:
        line: Input text line
    Returns:
        List of processed tokens
    """
    global documents, tokens, terms

    # Normalize input text
    line = line.replace('\t', ' ').strip()

    # Split into tokens
    token_list = splitchars(line)
    token_list = remove_stop_words(token_list)
    token_list = stem_tokens(token_list)
    token_list = remove_punctuation_tokens(token_list)

    for token in token_list:
        # Clean and normalize token
        token = token.replace('\n', '')
        lower_token = token.lower().strip()

        if not lower_token:  # Skip empty tokens
            continue

        tokens += 1  # Increment total token count

        # Add new term to database if not exists
        if lower_token not in database:
            terms += 1
            database[lower_token] = Term()
            database[lower_token].termid = terms
            database[lower_token].docids = {}
            database[lower_token].docs = 0

        # Update posting information
        if documents not in database[lower_token].docids:
            database[lower_token].docs += 1
            database[lower_token].docids[documents] = 0

        # Update term frequency
        database[lower_token].docids[documents] += 1
        database[lower_token].termfreq += 1

    return token_list
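
# Illustrative example with an assumed input: if the current document id is 7 and
# a line reads "index the index", the stop word "the" is removed (stop_word_count
# increases by 1), both remaining tokens stem to "index", and the in-memory entry
# ends up with database["index"].docids[7] == 2, while termfreq and the global
# token count each increase by 2.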

def process(filename: str) -> bool:
    """
    Process a single document file
    Args:
        filename: Path to document file
    Returns:
        Boolean indicating success
    """
    try:
        # print(f"Reading file: {filename}")
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                parsetoken(line)
        return True
    except IOError as e:
        print(f"Error processing file {filename}: {str(e)}")
        return False
    except UnicodeDecodeError:
        print(f"Unicode decode error in file {filename}")
        return False

def walkdir(cur: sqlite3.Cursor, dirname: str) -> bool:
    """
    Recursively walk through directory and process all files
    Args:
        cur: Database cursor
        dirname: Directory path
    Returns:
        Boolean indicating success
    """
    global documents

    try:
        # Get all files and directories
        all_items = [f for f in os.listdir(dirname)
                     if os.path.isdir(os.path.join(dirname, f))
                     or os.path.isfile(os.path.join(dirname, f))]

        for item in all_items:
            full_path = os.path.join(dirname, item)
            if os.path.isdir(full_path):
                print(f"Entering directory: {full_path}")
                walkdir(cur, full_path)
            else:
                # print(f"Processing file: {full_path}")
                documents += 1
                # Add document to dictionary
                cur.execute("INSERT INTO DocumentDictionary VALUES (?, ?)",
                            (full_path, documents))
                if not process(full_path):
                    print(f"Failed to process file: {full_path}")
        return True

    except Exception as e:
        print(f"Error walking directory {dirname}: {str(e)}")
        return False

def setup_database(cursor: sqlite3.Cursor):
    """
    Set up database tables and indexes
    Args:
        cursor: Database cursor
    """
    # Document Dictionary
    cursor.execute("DROP TABLE IF EXISTS DocumentDictionary")

    # Term Dictionary
    cursor.execute("DROP TABLE IF EXISTS TermDictionary")

    # Posting Table
    cursor.execute("DROP TABLE IF EXISTS Posting")

    # Create new tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS DocumentDictionary (
            DocumentName TEXT,
            DocId INTEGER PRIMARY KEY
        )
    """)

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS TermDictionary (
            Term TEXT,
            TermId INTEGER PRIMARY KEY
        )
    """)

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS Posting (
            TermId INTEGER,
            DocId INTEGER,
            tfidf REAL,
            docfreq INTEGER,
            termfreq INTEGER,
            FOREIGN KEY(TermId) REFERENCES TermDictionary(TermId),
            FOREIGN KEY(DocId) REFERENCES DocumentDictionary(DocId)
        )
    """)

    # Create indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_term ON TermDictionary(Term)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_term ON Posting(TermId)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_doc ON Posting(DocId)")

def calculate_frequencies():
    """
    Build per-document term frequency counters and per-term document frequencies
    from the in-memory index.
    """
    term_frequencies = defaultdict(Counter)  # doc_id -> Counter({term: frequency})
    document_frequencies = defaultdict(int)  # term -> number of documents containing it

    for term, term_obj in database.items():
        for doc_id, freq in term_obj.docids.items():
            term_frequencies[doc_id][term] = freq
            document_frequencies[term] += 1

    return term_frequencies, document_frequencies

def calculate_idf(document_frequencies, total_docs):
    """Compute the inverse document frequency idf(t) = ln(total_docs / df(t)) for every term."""
    idf = {}
    for term, df in document_frequencies.items():
        idf[term] = math.log(total_docs / df)
    return idf
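
# Illustrative example with assumed numbers: for total_docs = 100 and a term that
# appears in 10 documents, idf = math.log(100 / 10) = ln(10) ≈ 2.30.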

def calculate_tf_idf(term_frequencies, idf):
    """Combine per-document term frequencies with idf values into tf-idf weights."""
    tf_idf = {}

    for doc_id, tf in term_frequencies.items():
        tf_idf_doc = {}
        for term, freq in tf.items():
            tf_idf_doc[term] = freq * idf.get(term, 0)
        tf_idf[doc_id] = tf_idf_doc

    return tf_idf
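
# Illustrative result shape with assumed values: each entry of the returned
# structure maps a term to its tf-idf weight for one document, e.g.
#     {"algorithm": 9.21, "sort": 2.30}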

class InvertedIndex:
    """In-memory inverted index mapping each term to a postings list of (doc_id, weight) pairs."""

    def __init__(self):
        self.index = defaultdict(list)

    def add_document(self, doc_id, tf_idf):
        """Add one document's tf-idf weights to the postings lists."""
        for term, weight in tf_idf.items():
            self.index[term].append((doc_id, weight))
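
# Illustrative usage (this class is defined for the assignment but is not
# invoked by main() below):
#     inverted = InvertedIndex()
#     for doc_id, weights in tf_idf.items():
#         inverted.add_document(doc_id, weights)
#     # inverted.index["algorithm"] then holds a list of (doc_id, weight) pairs.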

def report_statistics():
    """Print summary statistics for the indexing run."""
    total_terms = sum(term.termfreq for term in database.values())
    unique_terms = len(database)
    print(f"Number of documents processed: {documents}")
    print(f"Total number of terms parsed from all documents: {tokens}")
    print(f"Total number of unique terms found and added to the index: {unique_terms}")
    print(f"Total number of terms found that matched one of the stop words: {stop_word_count}")

def main():
    """
    Main execution function
    """
    # Record start time
    start_time = time.localtime()
    print(f"Start Time: {start_time.tm_hour:02d}:{start_time.tm_min:02d}")

    # Initialize database
    db_path = "cacm_index.db"
    conn = sqlite3.connect(db_path)
    conn.isolation_level = None  # Enable autocommit
    cursor = conn.cursor()

    # Setup database tables
    setup_database(cursor)

    # Process corpus
    corpus_path = "./cacm"  # Update this path to match your environment
    if not os.path.exists(corpus_path):
        print(f"Error: Corpus directory not found at {corpus_path}")
        return

    walkdir(cursor, corpus_path)

    # Calculate tf-idf for each term in each document
    term_frequencies, document_frequencies = calculate_frequencies()
    idf = calculate_idf(document_frequencies, documents)
    tf_idf = calculate_tf_idf(term_frequencies, idf)

    # Insert terms into database
    for term, term_obj in database.items():
        cursor.execute("INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)",
                       (term, term_obj.termid))

        # Insert posting information
        for doc_id, freq in term_obj.docids.items():
            tfidf = freq * math.log(documents / term_obj.docs)
            cursor.execute("""
                INSERT INTO Posting
                (TermId, DocId, tfidf, docfreq, termfreq)
                VALUES (?, ?, ?, ?, ?)
            """, (term_obj.termid, doc_id, tfidf, term_obj.docs, freq))

    # Commit changes and close connection
    conn.commit()
    conn.close()

    # Print statistics
    report_statistics()

    end_time = time.localtime()
    print(f"\nEnd Time: {end_time.tm_hour:02d}:{end_time.tm_min:02d}")

if __name__ == '__main__':
    main()
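
To run the indexer (assuming the listing above is saved as, say, index_builder.py, that NLTK is installed via pip install nltk, and that the corpus files are unpacked into ./cacm next to the script), invoke python index_builder.py. The script creates cacm_index.db, fills the DocumentDictionary, TermDictionary and Posting tables, and prints the document, token, unique-term and stop-word counts produced by report_statistics().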
