# CS 3308 Programming Assignment Unit 4
import sys
import os
import re
import math
import sqlite3
import time
from typing import Dict, Set, List
from collections import defaultdict, Counter
from nltk.stem import PorterStemmer
class Term:
    """
    Class to represent term information in the index.

    Stores term frequency, document frequency, and posting information.
    Every field starts at a zero/empty value and is expected to be filled
    in by the indexing code.
    """

    def __init__(self):
        # Numeric identifier of the term; 0 until one is assigned.
        self.termid: int = 0
        # Term frequency counter for this term.
        self.termfreq: int = 0
        # Document frequency counter for this term.
        self.docs: int = 0
        # Posting information keyed by int.
        # NOTE(review): presumably maps document id -> occurrences of the
        # term in that document; confirm against the indexing code.
        self.docids: Dict[int, int] = {}

    def __repr__(self) -> str:
        """Return a compact, debugging-friendly description of the term."""
        return (
            f"{type(self).__name__}(termid={self.termid}, "
            f"termfreq={self.termfreq}, docs={self.docs}, "
            f"docids={self.docids})"
        )
# NOTE(review): orphaned fragment. The `def` line and body of the function
# these two statements belong to are not present in this file as pasted.
# Creates a PorterStemmer instance and binds it to `ps`.
ps = PorterStemmer()
# NOTE(review): `token_list` is not defined anywhere in the visible source,
# and a `return` outside a function is a SyntaxError. Presumably this is the
# tail of a tokenizing helper whose header was lost; restore it from the
# original assignment file.
return token_list
# NOTE(review): orphaned fragment. The enclosing `def` line is missing,
# `dirname` is not defined in the visible source, and the indentation of the
# try/except bodies was lost. Presumably part of the directory walker that
# main() calls as walkdir(cursor, corpus_path); confirm before restoring.
try:
# Get all files and directories
# Keeps every os.listdir() entry that is either a directory or a regular file.
all_items = [f for f in os.listdir(dirname)
if os.path.isdir(os.path.join(dirname, f))
or os.path.isfile(os.path.join(dirname, f))]
except Exception as e:
# Any failure while listing the directory is reported on stdout and
# signalled to the caller with a False return value.
print(f"Error walking directory {dirname}: {str(e)}")
return False
# NOTE(review): these statements use `cursor`, which is not defined in the
# visible source. Presumably they belong inside a schema-setup function whose
# `def` line was lost; confirm and re-indent under that function.

# Drop any existing tables so that the CREATE statements below start from an
# empty schema.
# Term Dictionary
cursor.execute("DROP TABLE IF EXISTS TermDictionary")
# Posting Table
cursor.execute("DROP TABLE IF EXISTS Posting")

# One row per term: the term text and its integer id.
cursor.execute("""
CREATE TABLE IF NOT EXISTS TermDictionary (
Term TEXT,
TermId INTEGER PRIMARY KEY
)
""")

# One row per (term, document) pair with its weights and counts.
# NOTE(review): the DocId foreign key references DocumentDictionary, which is
# not created in the visible source; verify it is created elsewhere.
cursor.execute("""
CREATE TABLE IF NOT EXISTS Posting (
TermId INTEGER,
DocId INTEGER,
tfidf REAL,
docfreq INTEGER,
termfreq INTEGER,
FOREIGN KEY(TermId) REFERENCES TermDictionary(TermId),
FOREIGN KEY(DocId) REFERENCES DocumentDictionary(DocId)
)
""")

# Create indexes
# Each statement is kept in a single string literal: a normal "..." literal
# cannot span physical lines, so the wrapped originals did not parse.
cursor.execute(
    "CREATE INDEX IF NOT EXISTS idx_term ON TermDictionary(Term)"
)
cursor.execute(
    "CREATE INDEX IF NOT EXISTS idx_posting_term ON Posting(TermId)"
)
cursor.execute(
    "CREATE INDEX IF NOT EXISTS idx_posting_doc ON Posting(DocId)"
)
def calculate_frequencies(documents=None) -> List[Dict[str, float]]:
    """
    Compute per-document TF-IDF weights.

    Args:
        documents: Optional iterable of documents, each an iterable of
            already-tokenized terms. When omitted (the original call form),
            there is nothing to weigh and an empty list is returned.

    Returns:
        A list with one dict per input document, in input order, mapping
        each term of that document to ``tf * idf`` where ``tf`` is the raw
        count of the term in the document and ``idf`` is
        ``log(N / df)`` (natural log, ``N`` = number of documents,
        ``df`` = number of documents containing the term).
    """
    if documents is None:
        documents = []

    # Raw term counts, one Counter per document.
    term_frequencies = [Counter(doc_tokens) for doc_tokens in documents]

    # Number of documents each term appears in (counted once per document).
    document_frequencies = defaultdict(int)
    for tf in term_frequencies:
        for term in tf:
            document_frequencies[term] += 1

    # df >= 1 for every recorded term, so the division is always defined.
    total_docs = len(term_frequencies)
    idf = {
        term: math.log(total_docs / df)
        for term, df in document_frequencies.items()
    }

    tf_idf = []
    for tf in term_frequencies:
        tf_idf_doc = {}
        for term, freq in tf.items():
            tf_idf_doc[term] = freq * idf.get(term, 0)
        tf_idf.append(tf_idf_doc)
    return tf_idf
class InvertedIndex:
    """
    Minimal inverted-index container.

    Holds a single mapping, ``index``, whose missing keys default to an
    empty list so callers can append to an entry without creating it first.
    """

    def __init__(self):
        # defaultdict(list): looking up an unseen key creates an empty list.
        self.index = defaultdict(list)
def report_statistics():
    """
    Print the indexing summary to stdout.

    Reads the module-level names ``database``, ``documents``, ``tokens`` and
    ``stop_word_count``; none of them are defined in this function.
    NOTE(review): presumably these are globals maintained by the indexing
    code; confirm they are defined before this function is called.
    """
    # Number of distinct entries in the in-memory index.
    unique_terms = len(database)
    print(f"Number of documents processed: {documents}")
    print(f"Total number of terms parsed from all documents: {tokens}")
    # Each message is a single-line f-string: an f"..." literal cannot span
    # physical lines, so the wrapped originals did not parse.
    print(
        "Total number of unique terms found and added to the index: "
        f"{unique_terms}"
    )
    print(
        "Total number of terms found that matched one of the stop words: "
        f"{stop_word_count}"
    )
def main():
    """
    Main execution function.

    Prints the start time, opens the SQLite database ``cacm_index.db`` in
    autocommit mode, indexes the corpus directory ``./cacm`` through
    ``walkdir``, prints the statistics report, and prints the end time.
    If the corpus directory does not exist, an error is printed and the
    function returns early. The database connection is closed on every
    exit path.
    """
    # Record start time
    start_time = time.localtime()
    print(f"Start Time: {start_time.tm_hour:02d}:{start_time.tm_min:02d}")

    # Initialize database
    db_path = "cacm_index.db"
    conn = sqlite3.connect(db_path)
    try:
        conn.isolation_level = None  # Enable autocommit
        cursor = conn.cursor()

        # Process corpus
        corpus_path = "./cacm"  # Update this path to match your environment
        if not os.path.exists(corpus_path):
            print(f"Error: Corpus directory not found at {corpus_path}")
            return
        walkdir(cursor, corpus_path)

        # Print statistics
        report_statistics()
    finally:
        # Release the database handle even on early return or error.
        conn.close()

    end_time = time.localtime()
    print(f"\nEnd Time: {end_time.tm_hour:02d}:{end_time.tm_min:02d}")
# Run the indexer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
# statistics