public class LuceneHelper extends Object
Modifier and Type | Field and Description |
---|---|
static org.apache.lucene.document.FieldType |
STORE_TERM_VECTORS |
static org.apache.lucene.document.FieldType |
STORE_TERM_VECTORS_NOT_STORED |
Modifier | Constructor and Description |
---|---|
protected |
LuceneHelper(File wikiIdtToLuceneIdSerialization,
File indexPath)
Opens or creates a Lucene index in the given directory
|
Modifier and Type | Method and Description |
---|---|
void |
addDocument(it.cnr.isti.hpc.wikipedia.article.Article a)
Indexes a Wikipedia Article
|
protected void |
addDocument(int id,
String content)
Adds a Wikipedia Article (added just for testing)
|
void |
clearIndex()
Clears the index
|
void |
closeWriter() |
void |
commit() |
protected void |
dumpWikiIdToLuceneId()
Dumps the map containing the conversion from the Wikipedia ids to the
Lucene Ids.
|
it.cnr.isti.hpc.wikipedia.article.Article |
getArticle(int id)
Retrieves an article from the index
|
it.cnr.isti.hpc.wikipedia.article.Article |
getArticleSummary(int id)
Retrieves only the article summary and the title from the index
|
double |
getCosineSimilarity(int x,
int y)
Returns the cosine similarity between two documents
|
double |
getCosineSimilarity(int x,
int y,
String field)
Returns the cosine similarity between two documents
|
static LuceneHelper |
getDexterLuceneHelper()
Returns a LuceneHelper instance for Dexter's Lucene index.
|
int |
getFreq(String query) |
int |
getFreq(String query,
String field) |
protected int |
getLuceneId(int wikiId) |
float |
getSimilarity(org.apache.lucene.search.Query query,
int wikiId)
Returns the TF-IDF similarity between a given query and an article
|
int |
getWikiId(int luceneId) |
static boolean |
hasDexterLuceneIndex() |
void |
loadWikiIdToLuceneId()
Loads the map containing the conversion from the Wikipedia ids to the
Lucene Ids.
|
int |
numDocs() |
protected void |
parseWikiIdToLuceneId()
Loads the map containing the conversion from the Wikipedia ids to the
Lucene Ids.
|
List<Integer> |
query(String query) |
List<Integer> |
query(String query,
String field) |
void |
rankBySimilarity(SpotMatch spot,
EntityMatchList eml,
String context)
Sorts a list of entities by their similarity (full text) with the string
context.
|
void |
rankBySimilarity(SpotMatch spot,
EntityMatchList eml,
String context,
String field)
Sorts a list of entities by their similarity with the string context.
|
public static final org.apache.lucene.document.FieldType STORE_TERM_VECTORS
public static final org.apache.lucene.document.FieldType STORE_TERM_VECTORS_NOT_STORED
protected LuceneHelper(File wikiIdtToLuceneIdSerialization, File indexPath)
wikiIdtToLuceneIdSerialization
- the file containing the serialized mapping between Wikipedia ids
and Lucene document ids
indexPath
- the path of the directory containing the Lucene index
public static boolean hasDexterLuceneIndex()
public static LuceneHelper getDexterLuceneHelper()
protected void parseWikiIdToLuceneId()
protected void dumpWikiIdToLuceneId()
public void loadWikiIdToLuceneId()
protected int getLuceneId(int wikiId)
public float getSimilarity(org.apache.lucene.search.Query query, int wikiId)
query
- the query to compare with the article
wikiId
- the id of the article to compare with the query
public double getCosineSimilarity(int x, int y)
x
- the WikiId of the first document
y
- the WikiId of the second document
public double getCosineSimilarity(int x, int y, String field)
x
- the WikiId of the first document
y
- the WikiId of the second document
field
- the field on which to compute the similarity
public void addDocument(it.cnr.isti.hpc.wikipedia.article.Article a)
a
- the article to index
protected void addDocument(int id, String content)
id
- the id of the Wikipedia Article
content
- the text of the Wikipedia Article
public void clearIndex()
public void commit()
public int getFreq(String query, String field)
query
- a query
field
- the field in which to search for the query
public int getFreq(String query)
query
- a query
field
- the field on which to perform the query (note: this overload declares no field parameter)
public int numDocs()
public void closeWriter()
public List<Integer> query(String query, String field)
public List<Integer> query(String query)
public it.cnr.isti.hpc.wikipedia.article.Article getArticle(int id)
id
- the Wikipedia Id of the Article
public it.cnr.isti.hpc.wikipedia.article.Article getArticleSummary(int id)
id
- the Wikipedia Id of the Article
public int getWikiId(int luceneId)
public void rankBySimilarity(SpotMatch spot, EntityMatchList eml, String context, String field)
spot
- the spot for which the entities are sorted
eml
- the entity list to sort
context
- the context text; entities are sorted based on their
similarity with the context.
field
- sort the entities based on the similarity between their text
in this field and the context.
public void rankBySimilarity(SpotMatch spot, EntityMatchList eml, String context)
spot
- the spot for which the entities are sorted
eml
- the entity list to sort
context
- the context text; entities are sorted based on their
similarity with the context.
Copyright © 2013. All rights reserved.