Contributing Guidelines
New contributors are more than welcome. If you want to contribute, just fork the project and send a pull request with your changes.
Weka Filter
AffectiveTweets methods extend the Weka Filter class, particularly the SimpleBatchFilter class.
Please read the instructions for implementing a Weka filter from here before continuing.
Implementing a new AffectiveTweets Filter
We will show how to implement a simple filter that adds a new numeric attribute to the given dataset. This attribute will count the number of times the words from a given list occur in a given tweet. The list is given as a comma-separated string.
New filters can extend the TweetToFeatureVector abstract class to inherit tokenization and many other preprocessing functionalities useful for sentiment analysis of tweets (e.g., reduce repeated letters, standardize URLs).
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* TweetToWordListCountFeatureVector.java
* Copyright (C) 1999-2019 University of Waikato, Hamilton, New Zealand
*
*/
package weka.filters.unsupervised.attribute;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionMetadata;
import weka.core.SparseInstance;
/**
 * A filter that appends one numeric attribute, {@code wordListCount}, to the dataset. For every
 * instance the attribute holds the number of tokens of the selected text attribute that occur in
 * a user-supplied, comma-separated word list.
 * <p>
 * Tokenization and the preprocessing switches used here (lower-casing, URL/user standardization,
 * repeated-letter reduction, tokenizer, stemmer, stopwords handler) are the members inherited
 * from {@link TweetToFeatureVector}.
 *
 * @author Felipe Bravo-Marquez (fbravoma@waikato.ac.nz)
 */
public class TweetToWordListCountFeatureVector extends TweetToFeatureVector {

    /** For serialization. */
    private static final long serialVersionUID = -573366510055859430L;

    /**
     * The given word list as a comma separated string. Kept public for backward compatibility;
     * prefer {@link #getWordList()} and {@link #setWordList(String)}.
     */
    public String wordList = "love,happy,great";

    /**
     * Returns a string describing this filter.
     *
     * @return a description of the filter suitable for displaying in the
     *         explorer/experimenter gui
     */
    @Override
    public String globalInfo() {
        return "A simple filter that counts occurrences of words from a given list.";
    }

    /**
     * Builds the output header: all input attributes, in their original order, followed by the
     * new numeric attribute {@code wordListCount}. The class index of the input is preserved.
     *
     * @param inputFormat the header of the incoming data
     * @return an empty dataset describing the output structure
     * @throws Exception if the output format cannot be determined
     * @see weka.filters.SimpleFilter#determineOutputFormat(weka.core.Instances)
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat)
            throws Exception {
        ArrayList<Attribute> att = new ArrayList<Attribute>();

        // Adds all attributes of the input format
        for (int i = 0; i < inputFormat.numAttributes(); i++) {
            att.add(inputFormat.attribute(i));
        }

        // adds the new attribute
        att.add(new Attribute("wordListCount"));

        Instances result = new Instances(inputFormat.relationName(), att, 0);

        // set the class index
        result.setClassIndex(inputFormat.classIndex());

        return result;
    }

    /**
     * Copies every instance into the output format and fills the trailing attribute with the
     * number of its tokens that are contained in the word list.
     *
     * @param instances the data to process
     * @return the processed data, one output instance per input instance
     * @throws Exception if processing fails
     * @see weka.filters.SimpleFilter#process(weka.core.Instances)
     */
    @Override
    protected Instances process(Instances instances) throws Exception {
        // set upper value for text index
        m_textIndex.setUpper(instances.numAttributes() - 1);

        Instances result = getOutputFormat();

        // attribute holding the text to analyse, selected through m_textIndex
        Attribute attrCont = instances.attribute(this.m_textIndex.getIndex());

        // The word list is the same for every instance, so it is parsed once up front
        // instead of once per instance.
        Set<String> wordSet = parseWordList(this.wordList);

        for (int i = 0; i < instances.numInstances(); i++) {
            Instance original = instances.instance(i);

            // copy all attribute values from the original dataset; the extra last slot
            // is reserved for the new count attribute
            double[] values = new double[result.numAttributes()];
            for (int n = 0; n < instances.numAttributes(); n++) {
                values[n] = original.value(n);
            }

            String content = original.stringValue(attrCont);

            // tokenize the content
            List<String> words = affective.core.Utils.tokenize(content, this.toLowerCase,
                    this.standarizeUrlsUsers, this.reduceRepeatedLetters, this.m_tokenizer,
                    this.m_stemmer, this.m_stopwordsHandler);

            // count all the occurrences of words from the list (repeated tokens count each time)
            int wordCounter = 0;
            for (String word : words) {
                if (wordSet.contains(word)) {
                    wordCounter++;
                }
            }

            // add the value to the last attribute
            values[values.length - 1] = wordCounter;

            // instance weight is 1
            Instance inst = new SparseInstance(1, values);
            inst.setDataset(result);

            // copy possible strings, relational values...
            copyValues(inst, false, instances, result);

            result.add(inst);
        }

        return result;
    }

    /**
     * Splits a comma separated word list into a set of words. Surrounding whitespace of each
     * entry is removed and empty entries are ignored, so {@code "love, happy,,great"} yields
     * {@code love}, {@code happy} and {@code great}.
     *
     * @param list the comma separated list, may be {@code null}
     * @return the distinct words of the list; empty if {@code list} is {@code null} or blank
     */
    private static Set<String> parseWordList(String list) {
        Set<String> parsed = new HashSet<String>();
        if (list == null) {
            return parsed;
        }
        for (String entry : list.split(",")) {
            String word = entry.trim();
            if (!word.isEmpty()) {
                parsed.add(word);
            }
        }
        return parsed;
    }

    /**
     * Main method for testing this class.
     *
     * @param args should contain arguments to the filter: use -h for help
     */
    public static void main(String[] args) {
        runFilter(new TweetToWordListCountFeatureVector(), args);
    }

    /**
     * Returns the word list. The OptionMetadata annotation allows setting this parameter from
     * within the command-line interface.
     *
     * @return the words to count as a comma separated string
     */
    @OptionMetadata(displayName = "wordlist",
            description = "The list with the words to count separated by a comma symbol.",
            commandLineParamName = "wordlist", commandLineParamSynopsis = "-wordlist <string>",
            displayOrder = 6)
    public String getWordList() {
        return wordList;
    }

    /**
     * Sets the word list.
     *
     * @param wordList the words to count as a comma separated string
     */
    public void setWordList(String wordList) {
        this.wordList = wordList;
    }
}
One way to use this new filter class from within Weka, assuming the source code of the class is in the appropriate subfolder of the src folder of the AffectiveTweets project, is to rebuild and reinstall the AffectiveTweets package by using the project’s build_package.xml file with the ant build tool.
Implementing a JUnit Test
To test the new filter we need to implement a JUnit test. A new filter test can extend AbstractFilterTest, which can be found in the Weka source code repository, as shown below:
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* Copyright (C) 2019 University of Waikato, Hamilton, New Zealand
*/
package weka.filters.unsupervised.attribute;
import weka.classifiers.meta.FilteredClassifier;
import weka.core.Instances;
import weka.filters.AbstractFilterTest;
import weka.filters.Filter;
import junit.framework.Test;
import junit.framework.TestSuite;
import java.io.File;
/**
* Tests TweetToWordListCountFeatureVectorTest. Run from the command line with: <p/>
* java weka.filters.unsupervised.attribute.TweetToWordListCountFeatureVectorTest
* <p>
* AffectiveTweets package must either be installed or
* JVM must be started in AffectiveTweets directory.
* <p>
* @author FracPete and eibe
* @version $Revision: 9568 $
*/
public class TweetToWordListCountFeatureVectorTest extends AbstractFilterTest {
public TweetToWordListCountFeatureVectorTest(String name) {
super(name);
}
/** Creates a default TweetToSentiStrengthFeatureVector filter */
public Filter getFilter() {
Filter f = null;
// Check to see if the test is run from directory containing build_package.xml
if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
File backup = weka.core.WekaPackageManager.PACKAGES_DIR;
weka.core.WekaPackageManager.PACKAGES_DIR = new java.io.File(".."); // So that default lexicon, etc., is found.
f = new TweetToWordListCountFeatureVector();
weka.core.WekaPackageManager.PACKAGES_DIR = backup;
} else {
f = new TweetToWordListCountFeatureVector(); // Hope that the package is installed.
}
return f;
}
/**
* Test for the FilteredClassifier used with this filter.
*
* @return the configured FilteredClassifier
*/
protected FilteredClassifier getFilteredClassifier() {
FilteredClassifier result;
result = new FilteredClassifier();
weka.filters.MultiFilter mf = new weka.filters.MultiFilter();
Filter[] filters = new Filter[2];
filters[0] = getFilter();
weka.filters.unsupervised.attribute.RemoveType rt = new weka.filters.unsupervised.attribute.RemoveType(); // Need to remove string attributes because they are kept by this filter.
filters[1] = rt;
mf.setFilters(filters);
result.setFilter(mf);
result.setClassifier(new weka.classifiers.functions.SMO());
return result;
}
/**
* Data to be used for FilteredClassifier test.
*
* @return the configured FilteredClassifier
*/
protected Instances getFilteredClassifierData() throws Exception {
Instances result;
// Check to see if the test is run from directory containing build_package.xml
if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
result = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
} else { // Hope that package is installed.
result = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
}
result.setClassIndex(result.numAttributes() - 1);
return result;
}
/**
* Called by JUnit before each test method. Sets up the Instances object to use based on
* one of the datasets that comes with the package.
*
* @throws Exception if an error occurs reading the example instances.
*/
protected void setUp() throws Exception {
super.setUp();
// Check to see if the test is run from directory containing build_package.xml
if ((new File(".." + File.separator + "AffectiveTweets" + File.separator + "build_package.xml")).exists()) {
m_Instances = (new weka.core.converters.ConverterUtils.DataSource("data" + File.separator + "sent140test.arff.gz")).getDataSet();
} else { // Hope that package is installed.
m_Instances = (new weka.core.converters.ConverterUtils.DataSource(weka.core.WekaPackageManager.PACKAGES_DIR.toString() + File.separator + "data" + File.separator + "sent140test.arff.gz")).getDataSet();
}
m_Instances.setClassIndex(m_Instances.numAttributes() - 1);
}
public static Test suite() {
return new TestSuite(TweetToWordListCountFeatureVectorTest.class);
}
public static void main(String[] args){
junit.textui.TestRunner.run(suite());
}
}