4 changes: 3 additions & 1 deletion Makefile
@@ -19,7 +19,9 @@ test: malayalam.a python

coverage-analysis: malayalam.a python
@python tests/coverage-test.py
sort -u tests/unanalyzed.lex --output=tests/unanalyzed.lex
wc tests/unanalyzed.lex

dataset:
pip install tqdm
python scripts/create-dataset.py
python scripts/create-dataset.py
10 changes: 10 additions & 0 deletions README.md
@@ -126,6 +126,16 @@ The analyser is being developed with lot of tests. To run tests :
$ make test
```

```bash
$ make coverage-analysis
```
runs the coverage tests and writes an `unanalyzed.lex` file under `tests/` listing the words the analyser could not analyse.
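The unanalyzed-word collection can be sketched as follows. This is a hypothetical simplification, not the project's actual code (which lives in `tests/coverage-test.py`); the `analyse` stub here just treats any word containing a Latin vowel as "analysed" for illustration.

```python
def analyse(word):
    # Stand-in for mlmorph's analyser: returns a non-empty list
    # when the word is "analysable" (here: contains a vowel).
    return [word] if any(c in "aeiou" for c in word) else []

def collect_unanalyzed(words, out_path):
    # Keep the words the analyser produced no analysis for,
    # and write them to out_path, one word per line.
    unanalyzed = [w for w in words if len(analyse(w)) == 0]
    with open(out_path, "w") as f:
        for w in unanalyzed:
            f.write(w + "\n")
    return unanalyzed

collect_unanalyzed(["malayalam", "xyz", "test"], "unanalyzed.lex")  # → ['xyz']
```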

## Dataset
```bash
$ make dataset
```
creates a `.csv` file with the words gathered from the `tests/coverage/*.txt` files.
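A minimal sketch of what this step does — the real logic is in `scripts/create-dataset.py`; the function name and the single-column CSV layout here are assumptions for illustration only.

```python
import csv
import glob
import os

def create_dataset(coverage_dir, out_csv):
    # Gather every whitespace-separated word from the coverage
    # corpora and write them out as one-column CSV rows.
    words = []
    for path in sorted(glob.glob(os.path.join(coverage_dir, "*.txt"))):
        with open(path) as f:
            for line in f:
                words.extend(line.split())
    with open(out_csv, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["word"])        # hypothetical header
        writer.writerows([w] for w in words)
    return len(words)
```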

## Citation

Please cite the following publication in order to refer to the mlmorph:
9 changes: 6 additions & 3 deletions tests/coverage-test.py
@@ -29,8 +29,9 @@ def test_total_coverage(self):
start = clock()
print("%40s\t%8s\t%8s\t%s" %
('File name', 'Words', 'Analysed', 'Percentage'))
for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
with open(filename, 'r') as file:
with open(os.path.join(CURR_DIR, "unanalyzed.lex"), "w") as unanFile:
for filename in glob.glob(os.path.join(CURR_DIR, "coverage", "*.txt")):
with open(filename, 'r') as file:
tokens_count = 0
analysed_tokens_count = 0
for line in file:
@@ -41,12 +42,14 @@
analysis = self.analyser.analyse(word, False)
if len(analysis) > 0:
analysed_tokens_count += 1
else:
unanFile.write(word+"\n")
percentage = (analysed_tokens_count/tokens_count)*100
total_tokens_count += tokens_count
total_analysed_tokens_count += analysed_tokens_count
print("%40s\t%8d\t%8d\t%3.2f%%" % (os.path.basename(
filename), tokens_count, analysed_tokens_count, percentage))
file.close()
file.close()
percentage = (total_analysed_tokens_count/total_tokens_count)*100
time_taken = clock() - start
print('%40s\t%8d\t%8d\t%3.2f%%' %