Applecore Pages on Microsoft Access

Counting the frequency of words in a text file

If you have a text file with large amounts of text in, and you wish to count the frequency of words in the file, but at the same time exclude common words such as 'the' and 'a', then the code below might be useful.

Firstly, you need to create a table to store the extracted words in, called tblWord, with just one field, of type Text, that is called Word. You need another table to store words that you don't wish to be counted in, called tblExclude, with one text field called Exclude.

You also need to create a query to show the frequency of the words, called qryFrequency. The example below will only display the 10 most frequest words, but this can be easily modified to return a different number, or even all records:

SELECT DISTINCTROW TOP 10 tblWord.Word, Count(tblWord.Word) AS Frequency
FROM tblWord LEFT JOIN tblExclude ON tblWord.Word = tblExclude.Exclude
GROUP BY tblWord.Word, tblExclude.Exclude
HAVING (((tblExclude.Exclude) Is Null))
ORDER BY Count(tblWord.Word) DESC;

And then paste the code below into a new module:

Option Compare Database
Option Explicit
 
Public Sub sCountFrequency()
    On Error GoTo E_Handle
    Dim strFile As String, strSQL As String, strInput, strText As String
    Dim intFile As Integer
    Dim intWordCOunt As Integer, intLoop As Integer
    Dim db As Database
    intFile = FreeFile
    strFile = "C:\test\test.txt"
    Open strFile For Input As intFile
    Do
        Line Input #intFile, strInput
        strText = strText & strInput
    Loop Until EOF(intFile)
    Close intFile
    strText = fReplaceCharacters(strText, ",", " ")
    strText = fReplaceCharacters(strText, Chr(34), "")
    strText = fReplaceCharacters(strText, ".", " ")
    strText = fReplaceCharacters(strText, " - ", " ")
    strText = fReplaceCharacters(strText, " ", " ")
    intWordCOunt = CountCSWords(strText)
    Set db = DBEngine(0)(0)
    db.Execute "DELETE * FROM tblWord;"
    For intLoop = 1 To intWordCOunt
        strSQL = "INSERT INTO tblWord (Word) VALUES(" & Chr(34) & GetCSWord(strText, intLoop) & Chr(34) & ");"
        db.Execute strSQL
    Next intLoop
    DoCmd.OpenQuery "qryFrequency"
sExit:
    On Error Resume Next
    Set db = Nothing
    Reset
    Exit Sub
E_Handle:
    MsgBox Err.Description & vbCrLf & "sCountFrequency", vbOKOnly + vbCritical, "Error: " & Err.Number
    Resume sExit
End Sub
 
Function fReplaceCharacters(ByVal strIn As String, strCharToReplace As String, strCharReplaceWith As String) As String
'   Procedure to replace one set of characters with another in a given string
'   Accepts:
'       strIn - the string that needs to be modified
'       strCharToReplace - the characters that need to be replaced
'       strCharReplaceWith - the characters that are replacing
'   Returns:
'       a string with any occurences of strCharToReplace replaced with strCharReplaceWith

    If InStr(1, strIn, strCharToReplace) > 0 Then
        Do Until InStr(1, strIn, strCharToReplace) = 0
            strIn = Left(strIn, InStr(1, strIn, strCharToReplace) - 1) & strCharReplaceWith & Mid(strIn, InStr(1, strIn, strCharToReplace) + Len(strCharToReplace))
        Loop
    End If
    fReplaceCharacters = strIn
End Function
 
Function CountCSWords(ByVal s) As Integer
'   Counts the words in a string that are separated by commas.
    Dim WC As Integer, Pos As Integer
    If VarType(s) <> 8 Or Len(s) = 0 Then
        CountCSWords = 0
        Exit Function
    End If
    WC = 1
    Pos = InStr(s, " ")
    Do While Pos > 0
        WC = WC + 1
        Pos = InStr(Pos + 1, s, " ")
    Loop
    CountCSWords = WC
End Function
 
Function GetCSWord(ByVal s, Indx As Integer)
'   Returns the nth word in a specific field.
    Dim WC As Integer, Count As Integer
    Dim SPos As Integer, EPos As Integer
    WC = CountCSWords(s)
    If Indx < 1 Or Indx > WC Then
        GetCSWord = Null
        Exit Function
    End If
    Count = 1
    SPos = 1
    For Count = 2 To Indx
        SPos = InStr(SPos, s, " ") + 1
    Next Count
    EPos = InStr(SPos, s, " ") - 1
    If EPos <= 0 Then EPos = Len(s)
    GetCSWord = Trim(Mid(s, SPos, EPos - SPos + 1))
End Function

The code above uses two functions, CountCSWords and GetCSWord, from a Microsoft Knowledge Base article.

Top

 


HOME | NEW | TABLES | QUERIES | FORMS | REPORTS | GENERAL | API | DOWNLOADS | TUTORIAL | RESOURCES
E-MAIL
Copyright & Disclaimer

 

Last modified at 06/06/2006 14:57:20