-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProblem4_PlagiarismDetectionSystem.java
More file actions
93 lines (64 loc) · 2.91 KB
/
Problem4_PlagiarismDetectionSystem.java
File metadata and controls
93 lines (64 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import java.util.*;
/**
 * Finds overlapping text between documents by comparing word n-grams.
 *
 * <p>Every document added through {@link #addDocument(String, String)} is lower-cased,
 * split into words, and cut into sliding windows of {@link #n} consecutive words. The
 * windows are stored per document and in an inverted index (n-gram to document ids),
 * which {@link #analyzeDocument(String)} uses to count shared n-grams.
 */
class PlagiarismDetector {
    /** Inverted index: each n-gram mapped to the ids of the documents that contain it. */
    HashMap<String, HashSet<String>> index = new HashMap<String, HashSet<String>>();

    /** Each document id mapped to its n-grams in order of appearance (duplicates kept). */
    HashMap<String, ArrayList<String>> documents = new HashMap<String, ArrayList<String>>();

    /** Number of consecutive words that make up one n-gram. */
    int n = 5;

    /**
     * Indexes a document under the given id.
     *
     * <p>Words are separated by any run of whitespace, so repeated spaces, tabs, and line
     * breaks never produce empty words. Adding an id that is already present replaces the
     * earlier version: its old n-grams are removed from the index first, so they cannot
     * show up as matches later.
     *
     * @param docId identifier to store the document under
     * @param text  raw document text; texts with fewer than {@code n} words yield no n-grams
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public void addDocument(String docId, String text) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        // Drop whatever an earlier version of this document put into the index.
        removeFromIndex(docId);

        String[] words = tokenize(text);
        ArrayList<String> ngrams = new ArrayList<String>();
        for (int i = 0; i <= words.length - n; i++) {
            String gram = joinWords(words, i, i + n);
            ngrams.add(gram);
            HashSet<String> owners = index.get(gram);
            if (owners == null) {
                owners = new HashSet<String>();
                index.put(gram, owners);
            }
            owners.add(docId);
        }
        documents.put(docId, ngrams);
    }

    /**
     * Prints how many n-grams of the given document also occur in each other document.
     *
     * <p>Similarity is the number of matching n-grams as a percentage of the analysed
     * document's n-gram count. Values above 50 are reported as plagiarism; every other
     * document with at least one match is reported as suspicious. If no document is
     * stored under {@code docId}, a single notice is printed and nothing is analysed.
     *
     * @param docId identifier of a previously added document
     */
    public void analyzeDocument(String docId) {
        ArrayList<String> grams = documents.get(docId);
        if (grams == null) {
            // Previously this dereferenced null and crashed.
            System.out.println("No document indexed under \"" + docId + "\"");
            return;
        }

        // Matching n-gram occurrences per other document.
        HashMap<String, Integer> matchCount = new HashMap<String, Integer>();
        for (String gram : grams) {
            HashSet<String> owners = index.get(gram);
            if (owners == null) {
                continue;
            }
            for (String other : owners) {
                if (other.equals(docId)) {
                    continue; // a document always matches itself
                }
                Integer soFar = matchCount.get(other);
                matchCount.put(other, soFar == null ? 1 : soFar + 1);
            }
        }

        System.out.println("Extracted " + grams.size() + " n-grams");
        for (Map.Entry<String, Integer> entry : matchCount.entrySet()) {
            int matches = entry.getValue();
            // grams is non-empty here: an empty list cannot have produced any match.
            double similarity = (matches * 100.0) / grams.size();
            String verdict = similarity > 50 ? "PLAGIARISM DETECTED" : "suspicious";
            System.out.println("Found " + matches + " matching n-grams with \"" + entry.getKey() + "\"");
            System.out.println("Similarity: " + similarity + "% (" + verdict + ")");
        }
    }

    /** Lower-cases the text (locale-independently) and splits it on runs of whitespace. */
    private String[] tokenize(String text) {
        String normalized = text.toLowerCase(Locale.ROOT).trim();
        if (normalized.isEmpty()) {
            // split() would return a single empty word for an empty string.
            return new String[0];
        }
        return normalized.split("\\s+");
    }

    /** Joins words[from..to) with single spaces. */
    private String joinWords(String[] words, int from, int to) {
        StringBuilder joined = new StringBuilder();
        for (int j = from; j < to; j++) {
            if (j > from) {
                joined.append(' ');
            }
            joined.append(words[j]);
        }
        return joined.toString();
    }

    /** Removes the document's current n-grams from the index, pruning entries left empty. */
    private void removeFromIndex(String docId) {
        ArrayList<String> previous = documents.get(docId);
        if (previous == null) {
            return;
        }
        for (String gram : previous) {
            HashSet<String> owners = index.get(gram);
            if (owners != null) {
                owners.remove(docId);
                if (owners.isEmpty()) {
                    index.remove(gram);
                }
            }
        }
    }
}
/**
 * Demo driver for {@code PlagiarismDetector}: indexes three documents and prints the
 * overlap report for the last one.
 */
public class Problem4_PlagiarismDetectionSystem {
    /**
     * Adds the sample essays to a fresh detector and analyses {@code essay_123.txt}.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        PlagiarismDetector detector = new PlagiarismDetector();

        // The two texts share their first ten words and differ only in the last two.
        String essay1 = "data science is an interdisciplinary field that uses scientific methods and algorithms";
        String essay2 = "data science is an interdisciplinary field that uses scientific methods for analysis";

        detector.addDocument("essay_089.txt", essay1);
        detector.addDocument("essay_092.txt", essay2);
        // essay_123.txt reuses essay1's text, so it is word-for-word identical to essay_089.txt.
        detector.addDocument("essay_123.txt", essay1);

        detector.analyzeDocument("essay_123.txt");
    }
}