From 3b62e75e727195af3c47769fb6b82dd688abd4c2 Mon Sep 17 00:00:00 2001 From: Jatin Saxena Date: Sat, 13 Nov 2021 13:43:23 -0600 Subject: [PATCH 1/5] Intermediate commit --- code/parsing-engine/build.gradle | 5 + .../java/analysisengine/ScoringEngine.java | 144 ++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java diff --git a/code/parsing-engine/build.gradle b/code/parsing-engine/build.gradle index c99aab7116..bb0c32b6e6 100644 --- a/code/parsing-engine/build.gradle +++ b/code/parsing-engine/build.gradle @@ -17,6 +17,11 @@ repositories { } dependencies { + implementation group: 'org.apache.lucene', name: 'lucene-core', version: '4.7.2' + implementation group: 'org.apache.lucene', name: 'lucene-queryparser', version: '4.7.2' + implementation group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '4.7.2' + + // Use JUnit Jupiter API for testing. testImplementation 'org.junit.jupiter:junit-jupiter-api:5.6.2' diff --git a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java new file mode 100644 index 0000000000..4a4f476147 --- /dev/null +++ b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java @@ -0,0 +1,144 @@ +package analysisengine; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; + +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.logging.Level; +import java.util.logging.Logger; + +public class ScoringEngine { + private static final Logger LOGGER = Logger.getLogger(ScoringEngine.class.getName()); + + private static Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_47); + private IndexWriter writer; + private ArrayList queue = new ArrayList<>(); + private static final String CORPUS_PATH = "/Users/jatinsaxena/IdeaProjects/CourseProject/code/parsing-engine/src/main/resources/CORPUS"; + + ScoringEngine() throws IOException { + FSDirectory dir = FSDirectory.open(new File(CORPUS_PATH)); + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,analyzer); + writer = new IndexWriter(dir, config); + } + + public void indexFilesDirectory() throws IOException { + + addFiles(new File(getClass().getResource("/CORPUS").getFile())); + + int OriginalNumDocs = writer.numDocs(); + + queue.forEach(file -> { + FileReader fr = null; + try { + fr = new FileReader(file); + Document document = new Document(); + //Add content from json file + document.add(new TextField("contents",fr)); + document.add(new StringField("path", file.getPath(), Field.Store.YES)); + document.add(new StringField("filename", file.getName(), Field.Store.YES)); + writer.addDocument(document); + } catch (IOException e) { + e.printStackTrace(); + } + finally { + try { + fr.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } + }); + int newNumDocs = writer.numDocs(); + + if(OriginalNumDocs == newNumDocs) { + LOGGER.log(Level.INFO,"All documents get indexed"); + } + else { + LOGGER.log(Level.INFO,"{} documents get indexed",OriginalNumDocs-newNumDocs); + } + queue.clear(); + writer.close(); + + + } + + public void searchQuery(String userQuery) throws IOException { + IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(CORPUS_PATH))); + IndexSearcher searcher = new IndexSearcher(reader); + TopScoreDocCollector collector = null; + + File docFile = new File(getClass().getResource(".").getFile() + "/ScoreList.txt"); + FileWriter docFileWriter = new FileWriter(docFile); + try { + Query q = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(userQuery); + collector = TopScoreDocCollector.create(1000,true); //Scoring for all the documents. + searcher.search(q, collector); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + for (int i = 0; i < Math.min(100, hits.length); ++i) { + int docId = hits[i].doc; + Document d = searcher.doc(docId); + String filename = d.get("filename"); + System.out.println("Filename--->>>" + filename + "Score-->>>" + hits[i].score); + filename = filename.substring(0, filename.length() - 4); + String concatenatedOutput = filename + " " + (i + 1) + " " + hits[i].score + " " + " LuceneModel\n"; + docFileWriter.write(concatenatedOutput); + } + + } + catch (Exception e) { + e.printStackTrace(); + } + docFileWriter.close(); + } + + private void addFiles(File file) { + + if (!file.exists()) { + System.out.println(file + " does not exist."); + } + if (file.isDirectory()) { + for (File f : file.listFiles()) { + addFiles(f); + } + } else { + String filename = file.getName().toLowerCase(); + // =================================================== + // Only index text files + // =================================================== + if (filename.endsWith(".txt")) { + queue.add(file); + } else { + System.out.println("Skipped " + filename); + } + } + } + + public static void main(String args[]) throws IOException { + ScoringEngine scoringEngine = new ScoringEngine(); + scoringEngine.indexFilesDirectory(); + scoringEngine.searchQuery("what articles exist which deal with tss time sharing system an operating system for ibm computers \n"); + } + + + +} From fbf8de35191421562f5dad37feca6c372f26a0cb Mon Sep 17 00:00:00 2001 From: Jatin Saxena Date: Sun, 14 Nov 2021 22:34:54 -0600 Subject: [PATCH 2/5] Basic search implementation --- .../java/analysisengine/ScoringEngine.java | 24 +++++++++---------- .../src/main/resources/CORPUS/REsume_Java.txt | 6 +++++ .../src/main/resources/CORPUS/Resume_UI.txt | 3 +++ 3 files changed, 20 insertions(+), 13 deletions(-) create mode 100644 code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt create mode 100644 code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt diff --git a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java index 4a4f476147..f226693424 100644 --- a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java +++ b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java @@ -20,24 +20,25 @@ import java.io.File; import java.io.FileReader; -import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; +import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; + public class ScoringEngine { private static final Logger LOGGER = Logger.getLogger(ScoringEngine.class.getName()); private static Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_47); private IndexWriter writer; private ArrayList queue = new ArrayList<>(); - private static final String CORPUS_PATH = "/Users/jatinsaxena/IdeaProjects/CourseProject/code/parsing-engine/src/main/resources/CORPUS"; ScoringEngine() throws IOException { - FSDirectory dir = FSDirectory.open(new File(CORPUS_PATH)); + FSDirectory dir = FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile())); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,analyzer); writer = new IndexWriter(dir, config); + //TODO - Implement Similarity } public void indexFilesDirectory() throws IOException { @@ -81,13 +82,12 @@ public void indexFilesDirectory() throws IOException { } - public void searchQuery(String userQuery) throws IOException { - IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(CORPUS_PATH))); + public List searchQuery(String userQuery) throws IOException { + IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile()))); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = null; + List resultDocList = new ArrayList<>(); - File docFile = new File(getClass().getResource(".").getFile() + "/ScoreList.txt"); - FileWriter docFileWriter = new FileWriter(docFile); try { Query q = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(userQuery); collector = TopScoreDocCollector.create(1000,true); //Scoring for all the documents. @@ -97,18 +97,16 @@ public void searchQuery(String userQuery) throws IOException { for (int i = 0; i < Math.min(100, hits.length); ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); + resultDocList.add(d); String filename = d.get("filename"); System.out.println("Filename--->>>" + filename + "Score-->>>" + hits[i].score); - filename = filename.substring(0, filename.length() - 4); - String concatenatedOutput = filename + " " + (i + 1) + " " + hits[i].score + " " + " LuceneModel\n"; - docFileWriter.write(concatenatedOutput); } - } catch (Exception e) { e.printStackTrace(); } - docFileWriter.close(); + System.out.println("Result Document-->>" + resultDocList); + return resultDocList; } private void addFiles(File file) { @@ -136,7 +134,7 @@ private void addFiles(File file) { public static void main(String args[]) throws IOException { ScoringEngine scoringEngine = new ScoringEngine(); scoringEngine.indexFilesDirectory(); - scoringEngine.searchQuery("what articles exist which deal with tss time sharing system an operating system for ibm computers \n"); + scoringEngine.searchQuery("java \n"); } diff --git a/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt b/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt new file mode 100644 index 0000000000..690f6fd9f0 --- /dev/null +++ b/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt @@ -0,0 +1,6 @@ +Java 5 years +Javascript 10 years +Spark 5 years +================== + +Java 2 years \ No newline at end of file diff --git a/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt new file mode 100644 index 0000000000..0986fd4320 --- /dev/null +++ b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt @@ -0,0 +1,3 @@ +ExtJS 5 years +Angular 10 years +Java 2 years From 6d5793ce1f87ceb72ea0d0be8d6b358afb26f60d Mon Sep 17 00:00:00 2001 From: Jatin Saxena Date: Thu, 18 Nov 2021 22:53:59 -0600 Subject: [PATCH 3/5] sample input change --- code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt index 0986fd4320..4ed03b354f 100644 --- a/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt +++ b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt @@ -1,3 +1,5 @@ ExtJS 5 years Angular 10 years Java 2 years +Javascript 2 years +Javascript 2 years From 19eb9e11743a4d32250a10a95a2bd6318478b703 Mon Sep 17 00:00:00 2001 From: Jatin Saxena Date: Fri, 26 Nov 2021 23:15:56 -0600 Subject: [PATCH 4/5] Scoring implementation --- code/parsing-engine/build.gradle | 3 + .../java/analysisengine/ScoringEngine.java | 112 ++++++++++++------ .../src/main/resources/CORPUS/REsume_Java.txt | 11 +- .../src/main/resources/CORPUS/Resume_UI.txt | 9 +- 4 files changed, 86 insertions(+), 49 deletions(-) diff --git a/code/parsing-engine/build.gradle b/code/parsing-engine/build.gradle index bb0c32b6e6..cf835d02b8 100644 --- a/code/parsing-engine/build.gradle +++ b/code/parsing-engine/build.gradle @@ -20,6 +20,9 @@ dependencies { implementation group: 'org.apache.lucene', name: 'lucene-core', version: '4.7.2' implementation group: 'org.apache.lucene', name: 'lucene-queryparser', version: '4.7.2' implementation group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '4.7.2' + implementation group: 'com.googlecode.json-simple', name: 'json-simple', version: '1.1.1' + + // Use JUnit Jupiter API for testing. diff --git a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java index f226693424..7872b9cb14 100644 --- a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java +++ b/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java @@ -4,25 +4,23 @@ import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; @@ -38,7 +36,6 @@ public class ScoringEngine { FSDirectory dir = FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile())); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,analyzer); writer = new IndexWriter(dir, config); - //TODO - Implement Similarity } public void indexFilesDirectory() throws IOException { @@ -47,26 +44,52 @@ public void indexFilesDirectory() throws IOException { int OriginalNumDocs = writer.numDocs(); + + queue.forEach(file -> { - FileReader fr = null; try { - fr = new FileReader(file); + + FileReader fr = new FileReader(file); + Object obj = new JSONParser().parse(fr); + JSONObject jo = (JSONObject) obj; + String location = (String) jo.get("location"); + JSONArray ja = (JSONArray) jo.get("skills"); + String allSkills = " "; + Iterator itr2 = ja.iterator(); Document document = new Document(); - //Add content from json file - document.add(new TextField("contents",fr)); document.add(new StringField("path", file.getPath(), Field.Store.YES)); document.add(new StringField("filename", file.getName(), Field.Store.YES)); - writer.addDocument(document); - } catch (IOException e) { - e.printStackTrace(); - } - finally { - try { - fr.close(); - } catch (IOException e) { - e.printStackTrace(); + document.add(new StringField("location", location, Field.Store.YES)); + while (itr2.hasNext()) { + Iterator itr1 = ((Map) itr2.next()).entrySet().iterator(); + int duration=0; + String skill = null; + while (itr1.hasNext()) { + Map.Entry pair = itr1.next(); + if(pair.getKey().toString().equalsIgnoreCase("duration")) { + duration = Integer.parseInt(pair.getValue().toString()); + } + if(pair.getKey().toString().equalsIgnoreCase("skill")) { + skill = pair.getValue().toString(); + } + + } + allSkills = allSkills + skill; + String skills = skill+"_FIELD"; + System.out.println("SKill with DUration-->>" + skills + " " + duration); + document.add(new IntField(skills, duration ,Field.Store.YES)); + + document.add(new StringField("allSkills", allSkills, Field.Store.YES)); + writer.addDocument(document); } + fr.close(); } + catch (Exception e) { + e.printStackTrace(); + } + + + }); int newNumDocs = writer.numDocs(); @@ -77,36 +100,51 @@ public void indexFilesDirectory() throws IOException { LOGGER.log(Level.INFO,"{} documents get indexed",OriginalNumDocs-newNumDocs); } queue.clear(); + writer.commit(); writer.close(); } - public List searchQuery(String userQuery) throws IOException { + public Set searchQuery(String userQuery) throws IOException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile()))); IndexSearcher searcher = new IndexSearcher(reader); + + Query query = NumericRangeQuery.newIntRange("Java_FIELD",5,50,true,true); + Query query2 = NumericRangeQuery.newIntRange("Kafka_FIELD", 1,30,true,true); + Query query3 = NumericRangeQuery.newIntRange("Angular_FIELD", 10,100,true,true); + query.setBoost((float) 2.0); + + BooleanQuery booleanQuery = new BooleanQuery(); + booleanQuery.add(query, BooleanClause.Occur.SHOULD); + booleanQuery.add(query2, BooleanClause.Occur.SHOULD); + booleanQuery.add(query3, BooleanClause.Occur.SHOULD); + + TopScoreDocCollector collector = null; - List resultDocList = new ArrayList<>(); + HashSet resultset = new LinkedHashSet<>(); try { - Query q = new QueryParser(Version.LUCENE_47, "contents", analyzer).parse(userQuery); - collector = TopScoreDocCollector.create(1000,true); //Scoring for all the documents. - searcher.search(q, collector); + collector = TopScoreDocCollector.create(100,true); //Scoring for all the documents. + searcher.search(booleanQuery, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; - for (int i = 0; i < Math.min(100, hits.length); ++i) { + for (int i = 0; i < Math.min(50, hits.length); ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); - resultDocList.add(d); - String filename = d.get("filename"); - System.out.println("Filename--->>>" + filename + "Score-->>>" + hits[i].score); + resultset.add(d.get("location")); + String location = d.get("location"); + System.out.println("File location--->>>" + location + " Score-->>>" + hits[i].score); } + resultset.forEach(doc -> { + System.out.println("New location--->>>" + doc); + }); } catch (Exception e) { e.printStackTrace(); } - System.out.println("Result Document-->>" + resultDocList); - return resultDocList; + System.out.println("Result Document-->>" + resultset); + return resultset; } private void addFiles(File file) { @@ -123,7 +161,7 @@ private void addFiles(File file) { // =================================================== // Only index text files // =================================================== - if (filename.endsWith(".txt")) { + if (filename.endsWith(".json")) { queue.add(file); } else { System.out.println("Skipped " + filename); @@ -134,9 +172,7 @@ private void addFiles(File file) { public static void main(String args[]) throws IOException { ScoringEngine scoringEngine = new ScoringEngine(); scoringEngine.indexFilesDirectory(); - scoringEngine.searchQuery("java \n"); + scoringEngine.searchQuery("Java"); } - - - + } diff --git a/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt b/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt index 690f6fd9f0..b544d529d4 100644 --- a/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt +++ b/code/parsing-engine/src/main/resources/CORPUS/REsume_Java.txt @@ -1,6 +1,5 @@ -Java 5 years -Javascript 10 years -Spark 5 years -================== - -Java 2 years \ No newline at end of file +Java 15 +Javascript 2 +Spark 5 +ExtJS 1 +Angular 1 \ No newline at end of file diff --git a/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt index 4ed03b354f..765a1f2a91 100644 --- a/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt +++ b/code/parsing-engine/src/main/resources/CORPUS/Resume_UI.txt @@ -1,5 +1,4 @@ -ExtJS 5 years -Angular 10 years -Java 2 years -Javascript 2 years -Javascript 2 years +ExtJS 5 +Angular 10 +Java 2 +Javascript 2 \ No newline at end of file From f71b97a69a7a476b1c5ecc6ba3667b230c0dd7c4 Mon Sep 17 00:00:00 2001 From: Jatin Saxena Date: Sat, 27 Nov 2021 02:29:52 -0600 Subject: [PATCH 5/5] Refined scoring implementation --- code/parsing-engine/build.gradle | 7 +- .../analysisengine/ScoringEngine.java | 80 +++++++++++-------- .../edu/illinois/phantom/model/UserQuery.java | 42 ++++++++++ 3 files changed, 93 insertions(+), 36 deletions(-) rename code/parsing-engine/src/main/java/{ => edu/illinois/phantom}/analysisengine/ScoringEngine.java (67%) create mode 100644 code/parsing-engine/src/main/java/edu/illinois/phantom/model/UserQuery.java diff --git a/code/parsing-engine/build.gradle b/code/parsing-engine/build.gradle index 8c7b25ed7f..3be30d04b5 100644 --- a/code/parsing-engine/build.gradle +++ b/code/parsing-engine/build.gradle @@ -14,12 +14,13 @@ plugins { repositories { // Use JCenter for resolving dependencies. jcenter() + mavenCentral() } dependencies { - implementation group: 'org.apache.lucene', name: 'lucene-core', version: '4.7.2' - implementation group: 'org.apache.lucene', name: 'lucene-queryparser', version: '4.7.2' - implementation group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '4.7.2' + implementation group: 'org.apache.lucene', name: 'lucene-core', version: '8.1.0' + implementation group: 'org.apache.lucene', name: 'lucene-queryparser', version: '8.1.0' + implementation group: 'org.apache.lucene', name: 'lucene-analyzers-common', version: '8.1.0' implementation group: 'com.googlecode.json-simple', name: 'json-simple', version: '1.1.1' diff --git a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java b/code/parsing-engine/src/main/java/edu/illinois/phantom/analysisengine/ScoringEngine.java similarity index 67% rename from code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java rename to code/parsing-engine/src/main/java/edu/illinois/phantom/analysisengine/ScoringEngine.java index 7872b9cb14..1cb4265bbd 100644 --- a/code/parsing-engine/src/main/java/analysisengine/ScoringEngine.java +++ b/code/parsing-engine/src/main/java/edu/illinois/phantom/analysisengine/ScoringEngine.java @@ -1,18 +1,15 @@ -package analysisengine; +package edu.illinois.phantom.analysisengine; +import edu.illinois.phantom.model.UserQuery; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.SimpleAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.IntField; -import org.apache.lucene.document.StringField; +import org.apache.lucene.document.*; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Version; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; @@ -20,21 +17,21 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.nio.file.Paths; import java.util.*; -import java.util.logging.Level; import java.util.logging.Logger; public class ScoringEngine { private static final Logger LOGGER = Logger.getLogger(ScoringEngine.class.getName()); - private static Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_47); + private static Analyzer analyzer = new SimpleAnalyzer(); private IndexWriter writer; private ArrayList queue = new ArrayList<>(); ScoringEngine() throws IOException { - FSDirectory dir = FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile())); - IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47,analyzer); + FSDirectory dir = FSDirectory.open(Paths.get(getClass().getResource("/CORPUS").getFile())); + IndexWriterConfig config = new IndexWriterConfig(analyzer); writer = new IndexWriter(dir, config); } @@ -42,9 +39,6 @@ public void indexFilesDirectory() throws IOException { addFiles(new File(getClass().getResource("/CORPUS").getFile())); - int OriginalNumDocs = writer.numDocs(); - - queue.forEach(file -> { try { @@ -75,10 +69,11 @@ public void indexFilesDirectory() throws IOException { } allSkills = allSkills + skill; - String skills = skill+"_FIELD"; - System.out.println("SKill with DUration-->>" + skills + " " + duration); - document.add(new IntField(skills, duration ,Field.Store.YES)); + String skills = skill.toUpperCase()+"_FIELD"; + //document.add(new LegacyIntField(skills, duration ,Field.Store.YES)); + document.add(new IntPoint(skills, duration)); + document.add(new StoredField(skills,duration)); document.add(new StringField("allSkills", allSkills, Field.Store.YES)); writer.addDocument(document); } @@ -88,17 +83,8 @@ public void indexFilesDirectory() throws IOException { e.printStackTrace(); } - - }); - int newNumDocs = writer.numDocs(); - if(OriginalNumDocs == newNumDocs) { - LOGGER.log(Level.INFO,"All documents get indexed"); - } - else { - LOGGER.log(Level.INFO,"{} documents get indexed",OriginalNumDocs-newNumDocs); - } queue.clear(); writer.commit(); writer.close(); @@ -106,11 +92,15 @@ public void indexFilesDirectory() throws IOException { } - public Set searchQuery(String userQuery) throws IOException { - IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(getClass().getResource("/CORPUS").getFile()))); + public Set searchQuery(List userQuery) throws IOException { + IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(getClass().getResource("/CORPUS").getFile()))); IndexSearcher searcher = new IndexSearcher(reader); - Query query = NumericRangeQuery.newIntRange("Java_FIELD",5,50,true,true); +// Query query = IntRange.newWithinQuery("Java_FIELD",new int[] {5},new int[] {Integer.MAX_VALUE}); +// Query query2 = IntRange.newWithinQuery("Kafka_FIELD", new int[] {1},new int[] {Integer.MAX_VALUE}); +// Query query3 = IntRange.newWithinQuery("Angular_FIELD", new int[] {10},new int[] {Integer.MAX_VALUE}); + + /*Query query = NumericRangeQuery.newIntRange("Java_FIELD",5,50,true,true); Query query2 = NumericRangeQuery.newIntRange("Kafka_FIELD", 1,30,true,true); Query query3 = NumericRangeQuery.newIntRange("Angular_FIELD", 10,100,true,true); query.setBoost((float) 2.0); @@ -120,12 +110,29 @@ public Set searchQuery(String userQuery) throws IOException { booleanQuery.add(query2, BooleanClause.Occur.SHOULD); booleanQuery.add(query3, BooleanClause.Occur.SHOULD); +*/ + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + userQuery.forEach(inputQuery -> { + Query query; + if(inputQuery.isMandatorySkill()) { + query = new BoostQuery(IntPoint.newRangeQuery(inputQuery.getSkill(), inputQuery.getMinExperience() + , Integer.MAX_VALUE), (float) inputQuery.getMinExperience()); + } + else { + query = IntPoint.newRangeQuery(inputQuery.getSkill(), inputQuery.getMinExperience() + , Integer.MAX_VALUE); + } + + builder.add(query,BooleanClause.Occur.SHOULD); + }); + + BooleanQuery booleanQuery = builder.build(); TopScoreDocCollector collector = null; HashSet resultset = new LinkedHashSet<>(); try { - collector = TopScoreDocCollector.create(100,true); //Scoring for all the documents. + collector = TopScoreDocCollector.create(100,Integer.MAX_VALUE); //Scoring for all the documents. searcher.search(booleanQuery, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; @@ -136,9 +143,6 @@ public Set searchQuery(String userQuery) throws IOException { String location = d.get("location"); System.out.println("File location--->>>" + location + " Score-->>>" + hits[i].score); } - resultset.forEach(doc -> { - System.out.println("New location--->>>" + doc); - }); } catch (Exception e) { e.printStackTrace(); @@ -172,7 +176,17 @@ private void addFiles(File file) { public static void main(String args[]) throws IOException { ScoringEngine scoringEngine = new ScoringEngine(); scoringEngine.indexFilesDirectory(); - scoringEngine.searchQuery("Java"); + //TODO: Remove Later + UserQuery query1 = new UserQuery("JAVA",15,true); + UserQuery query2 = new UserQuery("KAFKA",5,true); + UserQuery query3 = new UserQuery("ANGULAR",2,false); + + ArrayList userQueryArrayList = new ArrayList<>(); + userQueryArrayList.add(query1); + userQueryArrayList.add(query2); + userQueryArrayList.add(query3); + + scoringEngine.searchQuery(userQueryArrayList); } } diff --git a/code/parsing-engine/src/main/java/edu/illinois/phantom/model/UserQuery.java b/code/parsing-engine/src/main/java/edu/illinois/phantom/model/UserQuery.java new file mode 100644 index 0000000000..cae5d2165d --- /dev/null +++ b/code/parsing-engine/src/main/java/edu/illinois/phantom/model/UserQuery.java @@ -0,0 +1,42 @@ +package edu.illinois.phantom.model; + +import lombok.Builder; +import lombok.ToString; + +@Builder +@ToString +public class UserQuery { + private String skill; + private int minExperience; + boolean mandatorySkill; + + public UserQuery(String skill, int minExperience, boolean mandatorySkill) { + this.skill = skill.toUpperCase()+"_FIELD"; + this.minExperience = minExperience; + this.mandatorySkill = mandatorySkill; + } + + public String getSkill() { + return skill; + } + + public void setSkill(String skill) { + this.skill = skill.toUpperCase()+"_FIELD";; + } + + public int getMinExperience() { + return minExperience; + } + + public void setMinExperience(int minExperience) { + this.minExperience = minExperience; + } + + public boolean isMandatorySkill() { + return mandatorySkill; + } + + public void setMandatorySkill(boolean mandatorySkill) { + this.mandatorySkill = mandatorySkill; + } +}