[JustForFunPython] Classifying languages by the frequency of each letter of the alphabet

Inside my training folder
import glob
import json
import os.path
import re

from sklearn import metrics, svm
def check_freq(fname):
    """Read a text file and derive its language label from the file name.

    Intermediate version: it only loads and lower-cases the text. It does
    not count letters yet and returns nothing.

    Args:
        fname: Path to a UTF-8 text file whose base name starts with at
            least two lowercase ASCII letters.

    Raises:
        AttributeError: If the base name does not start with two or more
            lowercase letters (``re.match`` then returns ``None``).
    """
    name = os.path.basename(fname)
    # The leading run of lowercase letters in the file name is the label.
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()
    # Lower-case so that 'A' and 'a' are treated as the same letter.
    text = text.lower()
def check_freq(fname):
    """Read a text file and count how often each letter a-z occurs.

    Intermediate version: the counts are computed but not yet normalised
    or returned, so the function returns nothing.

    Args:
        fname: Path to a UTF-8 text file whose base name starts with at
            least two lowercase ASCII letters.

    Raises:
        AttributeError: If the base name does not start with two or more
            lowercase letters (``re.match`` then returns ``None``).
    """
    name = os.path.basename(fname)
    # The leading run of lowercase letters in the file name is the label.
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()
    text = text.lower()
    # One counter per letter, indexed by distance from 'a'.
    cnt = [0] * 26
    code_a = ord('a')
    # The upper bound must be named code_z: that is the name the loop reads.
    code_z = ord('z')
    for ch in text:
        n = ord(ch)
        # Anything outside a-z (digits, punctuation, accented letters) is skipped.
        if code_a <= n <= code_z:
            cnt[n - code_a] += 1
def check_freq(fname):
    """Return the relative frequency of each letter a-z in a file, plus its label.

    Args:
        fname: Path to a UTF-8 text file whose base name starts with at
            least two lowercase ASCII letters; that prefix is the label.

    Returns:
        A ``(freq, lang)`` tuple. ``freq`` is a list of 26 floats, where
        index 0 is the share of 'a' and index 25 the share of 'z' among
        all a-z letters in the lower-cased text. ``lang`` is the leading
        lowercase prefix of the file name. If the text contains no a-z
        letters, ``freq`` is all zeros.

    Raises:
        AttributeError: If the base name does not start with two or more
            lowercase letters (``re.match`` then returns ``None``).
    """
    name = os.path.basename(fname)
    lang = re.match(r'^[a-z]{2,}', name).group()
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    # One counter per letter, indexed by distance from 'a'.
    cnt = [0] * 26
    code_a = ord('a')
    # The upper bound must be named code_z: that is the name the loop reads.
    code_z = ord('z')
    for ch in text:
        n = ord(ch)
        # Anything outside a-z (digits, punctuation, accented letters) is skipped.
        if code_a <= n <= code_z:
            cnt[n - code_a] += 1
    total = sum(cnt)
    if total == 0:
        # No a-z letters at all: avoid dividing by zero.
        return ([0.0] * 26, lang)
    freq = [n / total for n in cnt]
    return (freq, lang)
def load_files(path):
    """Build feature and label lists from every file matching a glob pattern.

    Args:
        path: Glob pattern (for example ``"./train/*.txt"``) selecting the
            text files to load.

    Returns:
        A dict with two parallel lists: ``"freqs"`` holds the first element
        and ``"labels"`` the second element of each ``check_freq`` result.
        Both lists are empty when nothing matches the pattern.
    """
    freqs = []
    labels = []
    # glob.glob returns matches in arbitrary order; sorting keeps the sample
    # order (and anything serialised from it) reproducible between runs.
    for fname in sorted(glob.glob(path)):
        freq, lang = check_freq(fname)
        freqs.append(freq)
        labels.append(lang)
    return {"freqs": freqs, "labels": labels}
# Load the letter-frequency features and labels for both data sets.
data = load_files("./train/*.txt")
test = load_files("./test/*.txt")

# Save the extracted features so they can be reused without re-reading the texts.
with open("freq.json", "w", encoding='utf-8') as f:
    json.dump([data, test], f)

# Train a support-vector classifier on the training frequencies.
clf = svm.SVC(gamma='auto')
# load_files stores the features under "freqs"; the key "freq" does not exist.
clf.fit(data['freqs'], data['labels'])

# Score the predictions for the held-out test files.
predict = clf.predict(test['freqs'])
accuracy_score = metrics.accuracy_score(test['labels'], predict)
classification_report = metrics.classification_report(test['labels'], predict)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store