pubs.bib


@INPROCEEDINGS{Sountsov16,
  AUTHOR = {Pavel Sountsov and Sunita Sarawagi},
  TITLE = {Length bias in Encoder Decoder Models and a Case for Global Conditioning},
  BOOKTITLE = {EMNLP},
  YEAR = 2016,
  DOCUMENTURL = {https://arxiv.org/abs/1606.03402},
  ABSTRACT = {Encoder-decoder networks are popular for modeling sequences probabilistically in many applications. These models use the power of the Long Short-Term Memory (LSTM) architecture to capture the full dependence among variables, unlike earlier models like CRFs that typically assumed conditional independence among non-adjacent variables. However in practice encoder-decoder models exhibit a bias towards short sequences that surprisingly gets worse with increasing beam size. 
In this paper we show that such a phenomenon is due to a discrepancy between the full sequence margin and the per-element margin enforced by the locally conditioned training objective of an encoder-decoder model. The discrepancy more adversely impacts long sequences, explaining the bias towards predicting short sequences.
For the case where the predicted sequences come from a closed set, we show that a globally conditioned model alleviates the above problems of encoder-decoder models. From a practical point of view, our proposed model also eliminates the need for a beam-search during inference, which reduces to an efficient dot-product based search in a vector-space.}
}
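
% Editorial sketch (not from the paper): the contrast between the locally
% conditioned encoder-decoder objective and a globally conditioned model over a
% closed candidate set C, in notation of my own; s(x) and e(y) are assumed
% learned embeddings of the input and of a candidate output sequence.
\[
P_{\mathrm{local}}(y \mid x) \;=\; \prod_{t=1}^{|y|} P(y_t \mid y_{<t}, x),
\qquad
P_{\mathrm{global}}(y \mid x) \;=\; \frac{\exp\big(s(x)^\top e(y)\big)}{\sum_{y' \in \mathcal{C}} \exp\big(s(x)^\top e(y')\big)}.
\]
% Under the global model, decoding over the closed set reduces to a maximum
% dot-product search over the vectors e(y), which is the "efficient dot-product
% based search" the abstract refers to; no beam search is needed.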


@INPROCEEDINGS{Iyer16,
  AUTHOR = {Arun Iyer and Saketh Nath and Sunita Sarawagi},
  TITLE = {Privacy-preserving Class Ratio Estimation},
  BOOKTITLE = {ACM SIGKDD},
  YEAR = 2016,
  DOCUMENTURL = {http://www.kdd.org/kdd2016/papers/files/rfp1172-iyerA.pdf},
  ABSTRACT = {In this paper we present learning models for the class ratio estimation problem, which takes as input an unlabeled set of instances and predicts the proportions of instances in the set belonging to the different classes. This problem has applications in social and commercial data analysis. Existing models for class-ratio estimation, however, require instance-level supervision, whereas in domains like politics and demography, set-level supervision is more common. We present a new method for directly estimating class-ratios using set-level supervision. Another serious limitation in applying these techniques to sensitive domains like health is data privacy. We propose a novel label privacy-preserving mechanism that is well-suited for supervised class ratio estimation and has guarantees for achieving efficient differential privacy, provided the per-class counts are large enough. We derive learning bounds for the estimation with and without privacy constraints, which lead to important insights for the data-publisher. Extensive empirical evaluation shows that our model is more accurate than existing methods and that the proposed privacy mechanism and learning model are well-suited for each other.}
}


@INPROCEEDINGS{HalevyNSWY16,
  AUTHOR = {Alon Y. Halevy and
               Natalya Fridman Noy and
               Sunita Sarawagi and
               Steven Euijong Whang and
               Xiao Yu},
  TITLE = {Discovering Structure in the Universe of Attribute Names},
  BOOKTITLE = {WWW},
  YEAR = {2016},
  DOCUMENTURL = {http://research.google.com/pubs/archive/44320.pdf},
  ABSTRACT = {Recently, search engines have invested significant effort in answering entity--attribute queries from structured data, but have focused mostly on queries for frequently occurring attributes. In parallel, several research efforts have demonstrated that there is a long tail of attributes, often thousands per class of entities, that are of interest to users. Researchers are beginning to leverage these new collections of attributes to expand the ontologies that power search engines and to recognize entity--attribute queries. Because of the sheer number of potential attributes, such tasks require us to impose some structure on this long and heavy tail of attributes. This paper introduces the problem of organizing the attributes by expressing the compositional structure of their names as a rule-based grammar. These rules offer a compact and rich semantic interpretation of multi-word attributes, while generalizing from the observed attributes to new unseen ones. The paper describes an unsupervised learning method to generate such a grammar automatically from a large set of attribute names. Experiments show that our method can discover a precise grammar over 100,000 attributes of {\sc Countries} while providing a 40-fold compaction over the attribute names. Furthermore, our grammar enables us to increase the precision of attributes from 47\% to more than 90\% with only a minimal curation effort. Thus, our approach provides an efficient and scalable way to expand ontologies with attributes of user interest.}
}


@INPROCEEDINGS{MadaanMMRS16,
  AUTHOR = {Aman Madaan and
               Ashish Mittal and
               Mausam and
               Ganesh Ramakrishnan and
               Sunita Sarawagi},
  TITLE = {Numerical Relation Extraction with Minimal Supervision},
  BOOKTITLE = {AAAI Conference on Artificial Intelligence},
  YEAR = {2016},
  DOCUMENTURL = {http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12486/12020},
  ABSTRACT = {We study a novel task of numerical relation extraction with the goal of extracting relations where one of the arguments is a number or a quantity (e.g., atomic\_number(Aluminium, 13), inflation\_rate(India, 10.9\%)). This task presents peculiar challenges not found in standard IE, such as the difficulty of matching numbers in distant supervision and the importance of units. We design two extraction systems that require minimal human supervision per relation: (1) NumberRule, a rule-based extractor, and (2) NumberTron, a probabilistic graphical model. We find that both systems dramatically outperform MultiR, a state-of-the-art non-numerical IE model, obtaining up to 25 points F-score improvement.}
}


@INPROCEEDINGS{TrummerHLSG15,
  AUTHOR = {Immanuel Trummer and
               Alon Y. Halevy and
               Hongrae Lee and
               Sunita Sarawagi and
               Rahul Gupta},
  TITLE = {Mining Subjective Properties on the Web},
  BOOKTITLE = {ACM SIGMOD},
  YEAR = {2015},
  DOCUMENTURL = {http://dl.acm.org/citation.cfm?id=2750548},
  ABSTRACT = {Even with the recent developments in Web search for answering queries from structured data, search engines are still limited to queries with an objective answer, such as EUROPEAN CAPITALS or WOODY ALLEN MOVIES. However, many queries are subjective, such as SAFE CITIES or CUTE ANIMALS. The underlying knowledge bases of search engines do not contain answers to these queries because they do not have a ground truth. We describe the Surveyor system that mines the dominant opinion held by authors of Web content about whether a subjective property applies to a given entity. The evidence on which SURVEYOR relies is statements extracted from Web text that either support the property or claim its negation. The key challenge that SURVEYOR faces is that simply counting the number of positive and negative statements does not suffice, because there are multiple hidden biases with which content tends to be authored on the Web. SURVEYOR employs a probabilistic model of how content is authored on the Web. As one example, this model accounts for correlations between the subjective property and the frequency with which it is mentioned on the Web. The parameters of the model are specialized to each property and entity type.
Surveyor was able to process a large Web snapshot within a few hours, resulting in opinions for over 4~billion entity-property combinations. We selected a subset of 500 entity-property combinations and compared our results to the dominant opinion of a large number of Amazon Mechanical Turk (AMT) workers. The predictions of Surveyor match the results from AMT in 77\% of all cases (and 87\% for test cases where inter-worker agreement is high), significantly outperforming competing approaches.}
}


@INPROCEEDINGS{sara14,
  AUTHOR = {Sunita Sarawagi and Soumen Chakrabarti},
  TITLE = {Open-domain quantity queries on Web tables: Annotation, response, and consensus models},
  BOOKTITLE = {ACM SIGKDD},
  YEAR = 2014,
  DOCUMENTURL = {https://pdfs.semanticscholar.org/bc51/700cfae5ad693530331875b04ab4b62fc03e.pdf},
  ABSTRACT = {Over 40\% of columns in hundreds of millions of Web tables contain numeric quantities. Tables are a richer source of structured knowledge than free text. We harness Web tables to answer queries whose target is a quantity with natural variation, such as net worth of zuckerburg, battery life of ipad, half life of plutonium, and calories in pizza. Our goal is to respond to such queries with a ranked list of quantity distributions, suitably represented. Apart from the challenges of informal schema and noisy extractions, which have been known since tables were used for non-quantity information extraction, we face additional problems of noisy number formats, as well as unit specifications that are often contextual and ambiguous.

Early hardening of extraction decisions at a table level leads to poor accuracy. Instead, we use a probabilistic context free grammar (PCFG) based unit extractor on the tables, and retain several top-scoring extractions of quantity and numerals. Then we inject these into a new collective inference framework that makes global decisions about the relevance of candidate table snippets, the interpretation of the query's target quantity type, the value distributions to be ranked and presented, and the degree of consensus that can be built to support the proposed quantity distributions. Experiments with over 25 million Web tables and 350 diverse queries show robust, large benefits from our quantity catalog, unit extractor, and collective inference.}
}


@INPROCEEDINGS{iyer14,
  AUTHOR = {Arun Iyer and Saketh Nath and Sunita Sarawagi},
  TITLE = {Maximum Mean Discrepancy for Class Ratio Estimation: Convergence Bounds and Kernel Selection},
  BOOKTITLE = {ICML},
  YEAR = 2014,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icml14.pdf},
  ABSTRACT = {In recent times, many real world applications have emerged that
require estimates of class ratios in an unlabeled instance collection
as opposed to labels of individual instances in the collection.  In
this paper we investigate the use of maximum mean discrepancy (MMD) in
a reproducing kernel Hilbert space (RKHS) for estimating such ratios.
First, we theoretically analyze the MMD-based estimates. Our analysis
establishes that, under some mild conditions, the estimate is
statistically consistent. More importantly, it provides an upper bound
on the error in the estimate in terms of intuitive geometric
quantities like class separation and data spread.  Next, we use the
insights obtained from the theoretical analysis, to propose a novel
convex formulation that automatically learns the kernel to be employed
in the MMD-based estimation. We design an efficient cutting plane
algorithm for solving this formulation.  Finally, we empirically
compare our estimator with several existing methods, and show
significantly improved performance under varying datasets, class
ratios, and training sizes.}
}
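
% Editorial sketch (my own simplification, not the paper's method): for two
% classes and a fixed RBF kernel -- the paper instead learns the kernel -- the
% MMD-based class-ratio estimate has a closed form obtained by minimizing
% || theta*mu_1 + (1-theta)*mu_2 - mu_U ||^2 over the kernel mean embeddings.

import numpy as np

def rbf_kernel(A, B, gamma=1.0):
    # Pairwise RBF kernel matrix between the rows of A and the rows of B.
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def mmd_class_ratio(X1, X2, U, gamma=1.0):
    # Estimate the fraction of class-1 instances in the unlabeled set U.
    # theta* = <mu_U - mu_2, mu_1 - mu_2> / ||mu_1 - mu_2||^2, with every RKHS
    # inner product computed as an average of kernel evaluations.
    k11 = rbf_kernel(X1, X1, gamma).mean()
    k22 = rbf_kernel(X2, X2, gamma).mean()
    k12 = rbf_kernel(X1, X2, gamma).mean()
    k1u = rbf_kernel(X1, U, gamma).mean()
    k2u = rbf_kernel(X2, U, gamma).mean()
    num = (k1u - k2u) - (k12 - k22)
    den = k11 + k22 - 2 * k12
    return float(np.clip(num / den, 0.0, 1.0))

# Tiny synthetic check: U is 30% class 1, so the estimate should be near 0.3.
rng = np.random.default_rng(0)
X1 = rng.normal(+2, 1, size=(200, 1))
X2 = rng.normal(-2, 1, size=(200, 1))
U = np.vstack([rng.normal(+2, 1, size=(60, 1)), rng.normal(-2, 1, size=(140, 1))])
print(mmd_class_ratio(X1, X2, U, gamma=0.5))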


@INPROCEEDINGS{chaudhari14,
  AUTHOR = {Gaurish Chaudhari and Vashist Avadhanula and Sunita Sarawagi},
  TITLE = {A Few Good Predictions: Selective Node Labeling in a Social Network},
  BOOKTITLE = {WSDM},
  YEAR = 2014,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/wsdm14.pdf},
  ABSTRACT = {Many social network applications face the following problem: given a
network $G=(V,E)$ with labels on a small subset $\obs \subset V$ of
nodes and an optional set of features on nodes and edges, predict the
labels of the remaining nodes.  Much research has gone into designing
learning models and inference algorithms for accurate predictions in
this setting.  However, a core hurdle to any prediction effort is that
for many nodes there is insufficient evidence for inferring a label.

We propose that instead of focusing on the impossible task of
providing high accuracy over all nodes, we should focus on {\em
  selectively} making the few node predictions which will be correct
with high probability.  Any selective prediction strategy will require
that the scores attached to node predictions be well-calibrated.  Our
evaluations show that existing prediction algorithms are poorly
calibrated.  We propose a new method of training a graphical model
using a conditional likelihood objective that provides better
calibration than the existing joint likelihood objective.  We augment
it with a decoupled confidence model created using a novel unbiased
training process. Empirical evaluation on two large social networks
show that we are able to select a large number of predictions with
accuracy as high as 95\%, even when the best overall accuracy is only
40\%.}
}


@INPROCEEDINGS{sarawagi12,
  AUTHOR = {Namit Katariya and Arun Iyer and Sunita Sarawagi},
  TITLE = {Active Evaluation of Classifiers on Large Datasets},
  BOOKTITLE = {ICDM ({\bf Runners-up for Best paper award})},
  YEAR = 2012,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icdm2012.pdf},
  ABSTRACT = {The goal of this work is to estimate the accuracy of a classifier on a
large unlabeled dataset based on a small labeled set and a human
labeler.
We seek
to estimate accuracy and select instances for labeling in a loop via a
continuously refined stratified sampling strategy on the data.
We present a novel algorithm for learning $r$ bit hash functions to
stratify data so as to bring together instances with similar accuracy
values.  We show that our algorithm of learning hash functions
provides much better accuracy estimates than most existing algorithms.
A major challenge that we address in this paper is how to perform
stratified sampling on unlabeled data that is so large that in an
interactive setting we cannot afford to make one sequential scan to
find the instances in each stratum.  We present an optimal algorithm
for performing importance sampling on a static index over the data.
Experiments on a varied set of real-life datasets of up to 50 million
instances show that we are able to perform close to optimal stratified
sampling by reading three orders of magnitude less data.
}
}
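
% Editorial sketch (simplified; not the paper's learned r-bit hash functions):
% stratified accuracy estimation with equal-width classifier-score bins as the
% strata and a population-weighted mean of per-stratum labeled accuracies.

import numpy as np

def stratified_accuracy(scores, labeled_idx, correct, n_strata=10):
    # scores      : classifier confidence in [0, 1] for every instance
    # labeled_idx : indices of the instances labeled by the human labeler
    # correct     : 1/0 correctness of the prediction for each labeled instance
    scores = np.asarray(scores)
    correct = np.asarray(correct)
    strata = np.minimum((scores * n_strata).astype(int), n_strata - 1)
    weights = np.bincount(strata, minlength=n_strata) / len(scores)
    est = 0.0
    for s in range(n_strata):
        in_s = strata[labeled_idx] == s
        if in_s.any():
            est += weights[s] * correct[in_s].mean()   # per-stratum labeled accuracy
        else:
            est += weights[s] * correct.mean()         # unlabeled stratum: global fallback
    return est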


@INPROCEEDINGS{pimplikar12,
  AUTHOR = {Rakesh Pimplikar and Sunita Sarawagi},
  TITLE = {Answering Table Queries on the Web using Column Keywords},
  BOOKTITLE = {Proc. of the 38th Int'l Conference on Very Large Databases (VLDB)},
  YEAR = 2012,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb2012.pdf},
  ABSTRACT = {We present the design of a structured search engine which returns a
multi-column table in response to a query consisting of keywords
describing each of its columns.  We answer such queries by exploiting
the millions of tables on the Web because these are much richer
sources of structured knowledge than free-format text.
However, a corpus of tables harvested from arbitrary HTML webpages
presents huge challenges of diversity and redundancy not seen in
centrally edited knowledge bases.
We concentrate on one concrete task in this paper.  Given a set of Web
tables $T_1,\ldots,T_n$, and a query $Q$ with $q$ sets of keywords
$Q_1,\ldots,Q_q$, decide for each $T_i$ if it is relevant to $Q$ and
if so, identify the mapping between the columns of $T_i$ and query
columns.  
We represent this task as a graphical model that jointly maps all
tables by incorporating diverse sources of clues spanning matches in
different parts of the table, corpus-wide co-occurrence statistics,
and content overlap across table columns.  We define a novel query
segmentation model for matching keywords to table columns, and a robust
mechanism of exploiting content overlap across table columns.  We
design efficient inference algorithms based on bipartite matching and
constrained graph cuts to solve the joint labeling task.  Experiments
on a workload of 59 queries over a 25 million web table corpus show a
significant boost in accuracy over baseline IR methods.
}
}


@INPROCEEDINGS{gupta11,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi},
  TITLE = {Joint Training for Open-domain Extraction on the Web: Exploiting Overlap when Supervision is Limited},
  BOOKTITLE = {WSDM},
  YEAR = 2011,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/wsdm11.pdf},
  ABSTRACT = {
We consider the problem of jointly training structured models for
extraction from multiple web sources whose records enjoy partial
content overlap. This has important applications in open-domain
extraction, e.g. a user materializing a table of interest from
multiple relevant unstructured sources; or a site like Freebase
augmenting an incomplete relation by extracting more rows from web
sources. Such applications require extraction over arbitrary domains,
so one cannot use a pre-trained extractor or demand a huge labeled
dataset. We propose to overcome this lack of supervision by using
content overlap across the related web sources. Existing methods of
exploiting overlap have been developed under settings that do not
generalize easily to the scale and diversity of overlap seen on Web
sources.
   We present an agreement-based learning framework that jointly
trains the models by biasing them to agree on the agreement regions,
i.e. shared text segments. We present alternatives within our
framework to trade-off tractability, robustness to noise, and extent
of agreement enforced; and propose a scheme of partitioning agreement
regions that leads to efficient training while maximizing overall
accuracy. Further, we present a principled scheme to discover
low-noise agreement regions in unlabeled data across multiple sources.
   Through extensive experiments over 58 different extraction domains,
we establish that our framework provides significant boosts over
uncoupled training, and scores over alternatives such as collective
inference, staged training, and multi-view learning.
}
}


@ARTICLE{gupta10,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi and Ajit A. Diwan},
  TITLE = {Collective Inference for Extraction MRFs Coupled with
              Symmetric Clique Potentials},
  JOURNAL = {JMLR},
  YEAR = 2010,
  VOLUME = 11,
  MONTH = NOV,
  DOCUMENTURL = {http://www.jmlr.org/papers/volume11/gupta10a/gupta10a.pdf}
}


@INPROCEEDINGS{reddi10,
  AUTHOR = {Sashank J. Reddi and Sunita Sarawagi and Sundar Vishwanathan},
  TITLE = {{MAP} estimation in Binary {MRF}s via Bipartite Multi-cuts},
  BOOKTITLE = {NIPS (Oral presentation, {\bf Honorable mention for Outstanding student paper award})},
  YEAR = 2010,
  DOCUMENTURL = {http://books.nips.cc/papers/files/nips23/NIPS2010_0411.pdf},
  ABSTRACT = {We propose a new LP relaxation for obtaining the MAP assignment of a
binary MRF with pairwise potentials.  Our relaxation is derived from
reducing the MAP assignment problem to an instance of a recently
proposed Bipartite Multi-cut problem where the LP relaxation is
guaranteed to provide an $O(\log k)$ approximation where $k$ is the
number of vertices adjacent to non-submodular edges in the MRF.  We
then propose a combinatorial algorithm to efficiently solve the LP and
also provide a lower bound by concurrently solving its dual to within
an $\epsilon$ approximation.  The algorithm is up to an order of
magnitude faster and provides better MAP scores and bounds than the
state-of-the-art message passing algorithm of Sontag et al. 2008 that
tightens the local marginal polytope with third-order marginal
constraints.}
}


@INPROCEEDINGS{limaye10,
  AUTHOR = {Girija Limaye and Sunita Sarawagi and Soumen Chakrabarti},
  TITLE = {Annotating and Searching Web Tables Using Entities, Types and Relationships},
  BOOKTITLE = {VLDB},
  YEAR = 2010,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb2010.pdf},
  ABSTRACT = {Tables are a universal idiom to express relational data, even for
human consumption.  Billions of tables on Web pages express
entity references, attributes and relationships.
This representation of relational world knowledge
is usually considerably better than completely unstructured
free-format text.  At the same time, unlike manually-created knowledge
bases, relational information mined from ``organic'' Web tables need
not be constrained by availability of precious editorial time.  
Unfortunately, in the absence of 
any formal, uniform schema imposed on Web tables, Web
search cannot take advantage of these high-quality sources of
relational information.
In this paper we propose new
machine learning techniques to annotate table cells with entities
that they likely mention, table columns with
types from which entities are drawn for cells in the column, and
relations that pairs of table columns are seeking to express.
We propose a new graphical model for making all these labeling
decisions for each table simultaneously, rather than make separate
local decisions for entities, types and relations.
Experiments using the YAGO catalog, DBPedia, tables from Wikipedia,
and over 25 million HTML tables from a 500 million page Web crawl
uniformly show the superiority of our approach.
We also evaluate the impact of better annotations on a prototype
relational Web search tool.  We demonstrate clear benefits of our
annotations beyond indexing tables in a purely textual manner.
}
}


@ARTICLE{ChakrabartiSS10,
  AUTHOR = {Soumen Chakrabarti and
               Sunita Sarawagi and
               S. Sudarshan},
  TITLE = {Enhancing Search with Structure},
  JOURNAL = {IEEE Data Eng. Bull.},
  VOLUME = {33},
  NUMBER = {1},
  YEAR = {2010},
  PAGES = {3-24},
  DOCUMENTURL = {http://sites.computer.org/debull/A10mar/soumen-paper.pdf},
  ABSTRACT = {Keyword search has traditionally focussed on retrieving documents in ranked order, given simple
keyword queries. Similarly, work on keyword queries on structured data has focussed on retrieving closely
connected pieces of data that together contain given query keywords. In recent years, there has been
a good deal of work that attempts to go beyond the above paradigms, to improve search experience
on unstructured textual data as well as on structured or semi-structured data. In this paper, we survey
recent work on adding structure to keyword search, which can be categorized on three axes: (a) adding
structure to unstructured data, (b) adding structure to answers, and (c) adding structure to queries
allowing more power than simple keyword queries, but while avoiding the complexity of elaborate query
languages that demand extensive schema knowledge.}
}


@INPROCEEDINGS{gupta09,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi},
  TITLE = {Answering Table Augmentation Queries from Unstructured Lists on the Web},
  BOOKTITLE = {PVLDB},
  YEAR = 2009,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb09.pdf},
  ABSTRACT = {We present the design of a system for assembling a table
from a few example rows by harnessing the huge corpus of information-rich
but unstructured lists on the web.
We developed a totally unsupervised end-to-end approach which, given the
sample query rows ---
(a) retrieves HTML lists relevant to the query from a pre-indexed crawl of web lists,
(b) segments the list records and maps the segments to the query schema using a statistical model, 
(c) consolidates the results from multiple lists into a unified merged table,
(d) and presents to the user the consolidated records ranked
by their estimated membership in the target relation.

The key challenges in this task include construction of new rows from
very few examples, and an abundance of noisy and irrelevant lists that
swamp the consolidation and ranking of rows.  We propose modifications
to statistical record segmentation models, and present novel
consolidation and ranking techniques that can process input tables of
arbitrary schema without requiring any human supervision.
Experiments with Wikipedia target tables and 16 million
unstructured lists show that even with just three sample rows, our system is very effective at recreating Wikipedia tables, 
with a mean runtime of around 20s.}
}


@INPROCEEDINGS{sarawagi09,
  AUTHOR = {Sunita Sarawagi and Vinay S Deshpande and Sourabh Kasliwal},
  TITLE = {Efficient Top-K count queries over imprecise duplicates},
  BOOKTITLE = {EDBT},
  YEAR = 2009,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/edbt09.pdf},
  ABSTRACT = {We propose efficient techniques for processing various Top-K count
queries on data with duplicates. Our method differs from existing work
on duplicate elimination in two significant ways: First, we dedup on
the fly only the part of the data actually needed for the answer --- a
requirement in massive and rapidly evolving sources where batch
deduplication is not feasible.  Given the non-local nature of the
problem of partitioning data into duplicate groups, it is challenging
to perform early filtering of tuples relevant to the $K$ largest
groups.  We propose a novel method of successively collapsing and
pruning records which yield an order of magnitude reduction in running
time compared to deduplicating the entire data first.

Second, we return multiple high scoring answers to handle situations
where it is impossible to resolve if two records are indeed duplicates
of each other.  Since finding even the highest scoring deduplication
is NP-hard, the existing approach is to deploy one of many variants of
score-based clustering algorithms which do not easily generalize to
return multiple groupings. We model deduplication as a segmentation of
a linear embedding of records and present a polynomial time algorithm
to find the $R$ highest scoring answers. This method closely matches
the accuracy of an exact exponential time algorithm on several
datasets.}
}


@MISC{koudas06,
  AUTHOR = {Nick Koudas and Sunita Sarawagi and Divesh Srivastava},
  TITLE = {Record linkage: Similarity measures and algorithms},
  HOWPUBLISHED = {Tutorial at SIGMOD},
  YEAR = 2006
}


@ARTICLE{gupta08,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi},
  TITLE = {Domain Adaptation of Information Extraction Models},
  JOURNAL = {SIGMOD Record},
  YEAR = 2008,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigmodRecord08.pdf},
  ABSTRACT = {Domain adaptation refers to the process of adapting an extraction model trained in
one domain to another related domain with only unlabeled data.  We
present a brief survey of existing methods of retraining models to
best exploit labeled data from a related domain.  These approaches
that involve expensive model retraining are not practical when a large
number of new domains have to be handled in an operational setting.
We describe our approach for adapting record extraction models that
exploits the regularity within a domain to jointly label records
without retraining any model.}
}


@ARTICLE{ieSurvey08,
  AUTHOR = {Sunita Sarawagi},
  TITLE = {Information Extraction},
  JOURNAL = {Foundations and Trends in Databases},
  YEAR = 2008,
  VOLUME = 1,
  NUMBER = 3,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/ieSurvey.pdf}
}


@INPROCEEDINGS{sarawagi08,
  AUTHOR = {Sunita Sarawagi and Rahul Gupta},
  TITLE = {Accurate max-margin training for structured output spaces},
  BOOKTITLE = {Proceedings of the $\mathit{25}^{th}$ International Conference on Machine Learning (ICML), USA},
  YEAR = 2008,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icml08.pdf},
  ABSTRACT = {Tsochantaridis et al 2005 proposed two formulations for maximum
margin training of structured spaces: margin scaling and slack
scaling. While margin scaling has been extensively used since it
requires the same kind of MAP inference as normal structured
prediction, slack scaling is believed to be more accurate and
better-behaved.  We present an efficient variational approximation to
the slack scaling method that solves its inference bottleneck while
retaining its accuracy advantage over margin scaling.

We further argue that existing scaling approaches do not separate the
true labeling comprehensively while generating violating
constraints. We propose a new max-margin trainer PosLearn that
generates violators to ensure separation at each position of a
decomposable loss function.  

Empirical results on real datasets illustrate that PosLearn can reduce
test error by up to 25\% over margin scaling and 10\% over slack
scaling.  Further, PosLearn violators can be generated more
efficiently than slack violators; for many structured tasks the time
required is just twice that of MAP inference.}
}
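
% Editorial note (standard formulation from Tsochantaridis et al. 2005, included
% here only for reference): the two constraint forms the abstract contrasts,
% written for a training pair (x_i, y_i) and any alternative labeling y.
\[
\text{margin scaling:}\quad \mathbf{w}^\top f(x_i, y_i) - \mathbf{w}^\top f(x_i, y) \;\ge\; \Delta(y_i, y) - \xi_i,
\]
\[
\text{slack scaling:}\quad \mathbf{w}^\top f(x_i, y_i) - \mathbf{w}^\top f(x_i, y) \;\ge\; 1 - \frac{\xi_i}{\Delta(y_i, y)}.
\]
% Slack scaling rescales the slack rather than the margin, which is why finding
% its most violated constraint (the inference bottleneck mentioned above) is
% harder than the loss-augmented MAP inference needed for margin scaling.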


@INPROCEEDINGS{satpal07,
  AUTHOR = {Sandeep Satpal and Sunita Sarawagi},
  TITLE = {Domain Adaptation of Conditional Probability Models via Feature Subsetting},
  BOOKTITLE = {ECML/PKDD},
  YEAR = 2007,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/pkdd07.pdf},
  ABSTRACT = {The goal in domain adaptation is to train a model using labeled data
sampled from a domain different from the target domain on which the
model will be deployed.  We exploit unlabeled data from the target
domain to train a model that maximizes likelihood over the training
sample while minimizing the distance between the training and target
distribution.  Our focus is conditional probability models used for
predicting a label structure ${\bf y}$ given input ${\bf x}$ based on features
defined jointly over ${\bf x}$ and ${\bf y}$.  We propose practical measures
of divergence between the two domains based on which we penalize
features with large divergence, while improving the effectiveness of
other less deviant correlated features.  Empirical evaluation on
several real-life information extraction tasks using Conditional
Random Fields (CRFs) shows that our method of domain adaptation leads
to significant reduction in error.}
}


@INPROCEEDINGS{gupta07,
  AUTHOR = {Rahul Gupta and Ajit A. Diwan and Sunita Sarawagi},
  TITLE = {Efficient Inference with Cardinality-based Clique Potentials},
  BOOKTITLE = {Proceedings of the $\mathit{24}^{th}$ International Conference on Machine Learning (ICML), USA},
  YEAR = 2007,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icml07.pdf},
  ABSTRACT = {Many collective labeling tasks require inference on
graphical models where the clique potentials depend only on the number
of nodes that get a particular label. We design efficient inference
algorithms for various families of such potentials.
Our algorithms are exact for arbitrary cardinality-based clique
potentials on binary labels and for max-like and majority-like clique
potentials on multiple labels.  Moving towards more complex
potentials, we show that inference becomes NP-hard even on cliques
with homogeneous Potts potentials. We present a
$\frac{13}{15}$-approximation algorithm with runtime sub-quadratic in
the clique size. In contrast, the best known previous guarantee for
graphs with Potts potentials is only $0.5$.
We perform empirical comparisons on real and synthetic data, and show that our
proposed methods are an order of magnitude faster than the well-known
Tree-based re-parameterization (TRW) and graph-cut algorithms.}
}
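
% Editorial example (standard identity, my notation): a homogeneous Potts clique
% potential is cardinality-based because it depends only on the label counts
% n_a within the clique, not on which node received which label.
\[
\psi_c(\mathbf{y}_c) \;=\; \lambda \sum_{u < v \in c} [\![\, y_u = y_v \,]\!]
\;=\; \lambda \sum_{a} \binom{n_a}{2}.
\]
% Inference with such potentials therefore only has to reason over label
% histograms of the clique.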


@INPROCEEDINGS{gupta06,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi},
  TITLE = {Curating probabilistic databases from information extraction models},
  BOOKTITLE = {VLDB},
  YEAR = 2006,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb06.pdf},
  ABSTRACT = {Many real-life applications depend on databases automatically curated
from unstructured sources through imperfect structure extraction
tools.  Such databases are best treated as imprecise representations
of multiple extraction possibilities.  State-of-the-art statistical
models of extraction provide a sound probability distribution over
extractions but are not easy to represent and query in a relational
framework.  In this paper we address the challenge of approximating
such distributions as imprecise data models.  In particular, we
investigate a model that captures both row-level and column-level
uncertainty and show that this representation provides significantly
better approximation compared to models that use only row or only
column level uncertainty.  We present efficient algorithms for finding
the best approximating parameters for such a model; our algorithm
exploits the structure of the model to avoid enumerating the
exponential number of extraction possibilities.}
}


@INPROCEEDINGS{sarawagi06,
  AUTHOR = {Sunita Sarawagi},
  TITLE = {Efficient inference on sequence segmentation models},
  BOOKTITLE = {Proceedings of the $\mathit{23}^{rd}$ International Conference on Machine Learning (ICML), Pittsburgh, PA, USA},
  YEAR = 2006,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icml06.pdf},
  ABSTRACT = {Sequence segmentation is a flexible and highly accurate mechanism for
modeling several applications.  Inference on segmentation models
involves dynamic programming computations that in the worst case can
be cubic in the length of a sequence.  In contrast, typical sequence
labeling models require linear time.  We remove this limitation of
segmentation models vis-a-vis sequential models by designing a
succinct representation of potentials common across overlapping
segments.  We exploit such potentials to design efficient inference
algorithms that are both analytically shown to have a lower complexity
and empirically found to be comparable to sequential models for
typical extraction tasks.}
}


@INPROCEEDINGS{gupta06b,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi},
  TITLE = {MAP estimation in MRFs via rank aggregation},
  BOOKTITLE = {Proc. of the ICML Workshop on Open Problems in Statistical Relational Learning (co-presented with ICML Workshop on Learning in Structured Output Spaces)},
  YEAR = 2006,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icml06Wkshp.pdf},
  ABSTRACT = {Efficient estimation of the maximum a posteriori (MAP) assignment in large
statistical relational networks still remains an open issue in spite
of the extensive research in this area.
We propose a novel method of exploiting top-$K$ MAP estimates from
simpler subgraphs to find an assignment that is either MAP optimal, or
has an associated bound on how far it is from the optimal.  Our method
extends the well-known tree reweighted max-product algorithm (TRW) and
is guaranteed to always provide tighter upper bounds.
Experiments on synthetic and real data show that we are able to
find the optimal in many more cases than TRW, at significantly fewer
iterations, and our bounds are much tighter than those provided by TRW.}
}


@INPROCEEDINGS{mansuri05,
  AUTHOR = {Imran Mansuri and Sunita Sarawagi},
  TITLE = {A system for integrating unstructured data into relational databases},
  BOOKTITLE = {Proc. of the 22nd IEEE Int'l Conference on Data Engineering (ICDE)},
  YEAR = 2006,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icde06a.pdf},
  ABSTRACT = {In this paper we present a system for automatically integrating
unstructured text into a multi-relational database using
state-of-the-art statistical models for structure extraction and
matching.  
We show how to extend current high-performing models, Conditional
Random Fields and their semi-Markov counterparts, to effectively
exploit a variety of recognition clues available in a database of
entities, thereby significantly reducing the dependence on manually
labeled training data.
Our system is designed to load unstructured records into columns
spread across multiple tables in the database while resolving the
relationship of the extracted text with existing column values, and
preserving the cardinality and link constraints of the database.
We show how to combine the inference algorithms of statistical models
with the database imposed constraints for optimal data integration.
}
}


@INPROCEEDINGS{chandel05,
  AUTHOR = {Amit Chandel and P. C. Nagesh and Sunita Sarawagi},
  TITLE = {Efficient Batch Top-k Search for Dictionary-based Entity Recognition},
  BOOKTITLE = {ICDE},
  YEAR = 2006,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icde06b.pdf},
  ABSTRACT = {We consider the problem of speeding up Entity Recognition systems that
exploit existing large databases of structured entities to improve
extraction accuracy.  These systems require the computation of the
maximum similarity scores of several overlapping segments of the input
text with the entity database.  We formulate a batch Top-K problem
with the goal of sharing computations across overlapping segments.
Our proposed algorithm performs a factor of three faster than
independent top-K queries and only a factor of two slower than an
unachievable lower bound on total cost.  We then propose a novel
modification of the popular Viterbi algorithm for recognizing entities
so as to work with easily computable bounds on match scores, thereby
reducing the total inference time by a factor of eight compared to
state-of-the-art methods.}
}


@MISC{crfsf,
  TITLE = {The CRF project: a java implementation},
  AUTHOR = {Sunita Sarawagi},
  HOWPUBLISHED = {{\tt http://crf.sourceforge.net}},
  YEAR = 2004
}


@INPROCEEDINGS{sarawagi04b,
  AUTHOR = {S. Sarawagi},
  TITLE = {Models and Indices for Integrating Unstructured Data with a Relational Database},
  BOOKTITLE = {KDID},
  PAGES = {1--10},
  YEAR = 2004,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdid.pdf},
  ABSTRACT = {{Database systems are islands of structure in a sea of unstructured
data sources. Several real-world applications now need to create
bridges for smooth integration of semi-structured sources with existing
structured databases for seamless querying. This integration requires
extracting structured column values from the unstructured source and
mapping them to known database entities. Existing methods of data integration
do not effectively exploit the wealth of information available in
multi-relational entities.
We present statistical models for co-reference resolution and information
extraction in a database setting. We then go over the performance
challenges of training and applying these models efficiently over very
large databases. This requires us to break open a black box statistical
model and extract predicates over indexable attributes of the database.
We show how to extract such predicates for several classification models,
including naive Bayes classifiers and support vector machines. We extend
these indexing methods for supporting similarity predicates needed
during data integration.}}
}


@INCOLLECTION{sarawagi05,
  AUTHOR = {S. Sarawagi},
  TITLE = {Sequence Data Mining},
  BOOKTITLE = {Advanced Methods for Knowledge Discovery from Complex Data},
  PUBLISHER = {Springer Verlag},
  YEAR = 2005
}


@INPROCEEDINGS{godbole05,
  AUTHOR = {Shantanu Godbole and Ganesh Ramakrishnan and Sunita Sarawagi},
  TITLE = {Text Classification with evolving label-sets},
  BOOKTITLE = {ICDM},
  YEAR = 2005
}


@INPROCEEDINGS{vs05,
  AUTHOR = {V. G. Vinod Vydiswaran and Sunita Sarawagi},
  TITLE = {Learning to extract information from large websites using sequential models},
  BOOKTITLE = {COMAD},
  NOTE = {(Best paper award)},
  YEAR = 2005
}


@INPROCEEDINGS{sarawagi04,
  AUTHOR = {Sunita Sarawagi and William W. Cohen},
  TITLE = {Semi-Markov Conditional Random Fields for Information Extraction},
  BOOKTITLE = {NIPS},
  YEAR = {2004},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/nips04.pdf},
  ABSTRACT = {We describe semi-Markov conditional random fields (semi-CRFs), a
conditionally trained version of semi-Markov chains.  Intuitively, a
semi-CRF on an input sequence ${\bf x}$ outputs a ``segmentation'' of
${\bf x}$, in which labels are assigned to segments ({\it i.e.},
subsequences) of ${\bf x}$ rather than to individual elements $x_i$ of
${\bf x}$.  Importantly, features for semi-CRFs can measure properties of
segments, and transitions within a segment can be non-Markovian.  In
spite of this additional power, exact learning and inference
algorithms for semi-CRFs are polynomial-time---often only a small
constant factor slower than conventional CRFs.  In experiments on five
named entity recognition problems, semi-CRFs generally outperform
conventional CRFs.}
}
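
% Editorial summary (standard semi-CRF notation consistent with the abstract): a
% segmentation s = <(t_j, u_j, y_j)> labels the segment x_{t_j..u_j} with y_j, and
\[
P(s \mid \mathbf{x}) \;=\; \frac{1}{Z(\mathbf{x})} \exp\Big( \sum_{j} \mathbf{w}^\top \mathbf{g}(y_j, y_{j-1}, \mathbf{x}, t_j, u_j) \Big),
\]
% where the features g may inspect whole segments. With segment length bounded
% by L, Viterbi-style inference uses the recursion
\[
V(i, y) \;=\; \max_{1 \le d \le L} \; \max_{y'} \; \Big( V(i-d, y') + \mathbf{w}^\top \mathbf{g}(y, y', \mathbf{x}, i-d+1, i) \Big),
\]
% which is roughly a factor of L more expensive than the corresponding CRF recursion.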


@INPROCEEDINGS{godbole04,
  AUTHOR = {Shantanu Godbole and Sunita Sarawagi},
  TITLE = {Discriminative methods for multi-labeled classification},
  BOOKTITLE = {PAKDD},
  YEAR = 2004
}


@INPROCEEDINGS{harpale04,
  AUTHOR = {Abhay Harpale and Shantanu Godbole and Sunita Sarawagi and Soumen Chakrabarti},
  TITLE = {Document classification through interactive supervision of document and term labels},
  BOOKTITLE = {ECML/PKDD},
  YEAR = 2004
}


@INPROCEEDINGS{CohenKDD2004,
  AUTHOR = {William W. Cohen and Sunita Sarawagi},
  TITLE = {Exploiting Dictionaries in Named Entity Extraction: Combining 
          Semi-Markov Extraction Processes and Data Integration Methods},
  BOOKTITLE = {Proceedings of the Tenth ACM SIGKDD International Conference
               on Knowledge Discovery and Data Mining, Seattle, USA},
  YEAR = {2004},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdd04.pdf},
  ABSTRACT = {We consider the problem of improving named entity recognition (NER)
systems by using external dictionaries---more specifically, the
problem of extending state-of-the-art NER systems by incorporating
information about the similarity of extracted entities to entities in
an external dictionary.  This is difficult because most
high-performance named entity recognition systems operate by
sequentially classifying {\em words\/} as to whether or not they
participate in an entity name; however, the most useful similarity
measures score {\em entire candidate names\/}.  To correct this mismatch
we formalize a semi-Markov extraction process, which is based on
sequentially classifying {\em segments\/} of several adjacent words,
rather than single words.  In addition to allowing a natural way of
coupling high-performance NER methods and high-performance similarity
functions, this formalism also allows the direct use of other useful
entity-level features, and provides a more natural formulation of the
NER problem than sequential word classification.  Experiments in
multiple domains show that the new model can substantially improve
extraction performance over previous methods for using external
dictionaries in NER.
}
}


@INPROCEEDINGS{sara04,
  AUTHOR = {Sunita Sarawagi and Alok Kirpal},
  TITLE = {Efficient set joins on similarity predicates},
  BOOKTITLE = {Proceedings of the  ACM SIGMOD International Conference
               on Management of Data},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigmod04.pdf},
  YEAR = {2004},
  ABSTRACT = {In this paper we present an efficient, scalable and general algorithm for
performing set joins on predicates involving various similarity
measures like intersect size, Jaccard-coefficient, cosine similarity,
and edit-distance.  This expands the existing suite of algorithms for
set joins on simpler predicates such as set containment, equality and
non-zero overlap.  We start with a basic inverted index based probing
method and add a sequence of optimizations that result in one to two
orders of magnitude improvement in running time.  The algorithm folds
in a data partitioning strategy that can work efficiently with an
index compressed to fit in any available amount of main memory.  The
optimizations used in our algorithm  generalize  to
several weighted and unweighted measures of partial word overlap
between sets.}
}
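
% Editorial sketch (only the basic inverted-index probing step the abstract
% starts from; none of the paper's optimizations or other measures): a
% Jaccard-similarity self-join over token sets.

from collections import defaultdict

def jaccard_join(sets, threshold):
    # Return pairs (i, j), i < j, with Jaccard(sets[i], sets[j]) >= threshold.
    index = defaultdict(list)                # token -> ids of the sets containing it
    for i, s in enumerate(sets):
        for tok in s:
            index[tok].append(i)
    results = []
    for j, s in enumerate(sets):
        overlap = defaultdict(int)           # candidate id -> intersection size
        for tok in s:
            for i in index[tok]:
                if i < j:
                    overlap[i] += 1
        for i, inter in overlap.items():
            union = len(sets[i]) + len(s) - inter
            if inter / union >= threshold:
                results.append((i, j))
    return results

print(jaccard_join([{"a", "b", "c"}, {"a", "b", "d"}, {"x", "y"}], 0.4))  # [(0, 1)]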


@MISC{sc03,
  AUTHOR = {Sunita Sarawagi and Mark Craven},
  TITLE = {Sequence Data Mining},
  YEAR = 2003,
  HOWPUBLISHED = {Tutorial at the ACM Int'l Conference on Knowledge Discovery and Data Mining (SIGKDD)}
}


@INPROCEEDINGS{scg02,
  AUTHOR = {Sunita Sarawagi and Soumen Chakrabarti and Shantanu Godbole},
  TITLE = {Cross training: learning probabilistic mappings between topics},
  BOOKTITLE = {Proc. of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2003)},
  ADDRESS = {Washington D.C., USA},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdd03.pdf},
  MONTH = {August},
  YEAR = 2003,
  ABSTRACT = {
Classification is a well-established operation in text mining.  Given
a set of labels $A$ and a set $D_A$ of training documents tagged with
these labels, a classifier learns to assign labels to unlabeled test
documents.  Suppose we also had available a different set of labels
$B$, together with a set of documents $D_B$ marked with labels
from~$B$.  If $A$ and $B$ have some semantic overlap, can the
availability of $D_B$ help us build a better classifier for $A$, and
vice versa?  We answer this question in the affirmative by 
proposing \emph{cross-training}: a new approach to semi-supervised
learning in presence of multiple label sets.  We give distributional
and discriminative algorithms for cross-training and show, through
extensive experiments, that cross-training
can discover and exploit probabilistic relations between two
taxonomies for more accurate classification.
}
}


@ARTICLE{ChaudhuriNS04,
  AUTHOR = {Surajit Chaudhuri and
               Vivek R. Narasayya and
               Sunita Sarawagi},
  TITLE = {Extracting predicates from mining models for efficient query
               evaluation},
  JOURNAL = {ACM Trans. Database Syst.},
  VOLUME = {29},
  NUMBER = {3},
  YEAR = {2004},
  PAGES = {508-544}
}


@INPROCEEDINGS{cgs03,
  AUTHOR = {Surajit Chaudhuri and Prasanna Ganesan and Sunita Sarawagi},
  TITLE = {Factorizing Complex Predicates in Queries to Exploit Indexes},
  ADDRESS = {{San Diego, USA}},
  BOOKTITLE = {{Proc. ACM SIGMOD International Conf. on Management of Data}},
  MONTH = {June},
  YEAR = 2003,
  ABSTRACT = {
Decision-support applications generate queries with complex predicates. We show how  the {\em factorization} of complex query expressions exposes significant opportunities for exploiting  available indexes. We  also  present a novel idea of relaxing predicates in a complex condition to create possibilities for factoring. Our algorithms are designed for easy integration with existing query optimizers and support multiple optimization levels, providing different trade-offs between plan complexity and optimization time. 
}
}


@INPROCEEDINGS{sk03,
  AUTHOR = {Sunita Sarawagi and Alok Kirpal},
  TITLE = {Scaling up the ALIAS Duplicate Elimination System: A Demonstration},
  BOOKTITLE = {Proc. of the 19th IEEE Int'l Conference on Data Engineering (ICDE)},
  ADDRESS = {Bangalore},
  MONTH = {March},
  YEAR = 2003,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icde03demo.pdf},
  ABSTRACT = {
Duplicate elimination is an important step in integrating data from
multiple sources. The challenges involved are finding a robust
deduplication function that can identify when two records are
duplicates and efficiently applying the function on very large lists
of records.  In ALIAS the task of designing a deduplication function
is eased by learning the function from examples of duplicates and
non-duplicates and by using active learning to spot such examples
effectively~\cite{sb02}.  Here we investigate the issues involved in
efficiently applying the learnt deduplication system on large lists of
records. We demonstrate the working of the ALIAS evaluation engine and
highlight the optimizations it uses to significantly cut down the
number of record pairs that need to be explicitly materialized.
}
}


@ARTICLE{Wangikar2003,
  AUTHOR = {P. P. Wangikar and A. V. Tendulkar and S. Ramya and D. N. Mali and S. Sarawagi},
  TITLE = {Functional Sites in Protein Families Uncovered via an Objective and Automated Graph Theoretic Approach},
  JOURNAL = {Journal of Molecular Biology},
  YEAR = 2003,
  VOLUME = 326,
  NUMBER = 3,
  PAGES = {955-978}
}


@INPROCEEDINGS{sb02,
  AUTHOR = {Sunita Sarawagi and Anuradha Bhamidipaty},
  TITLE = {Interactive deduplication using active learning},
  YEAR = 2002,
  BOOKTITLE = {SIGKDD},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdd02.pdf},
  ABSTRACT = {Deduplication is a key operation in integrating data from multiple
sources.  The main challenge in this task is designing a function that
can resolve when a pair of records refer to the same entity in spite
of various data inconsistencies.  Most existing systems use hand-coded
functions.  One way to overcome the tedium of hand-coding is to train
a classifier to distinguish between duplicates and non-duplicates.
The success of this method critically hinges on being able to provide
a {\em covering and challenging} set of training pairs that bring out
the subtlety of the deduplication function.  
This is non-trivial because it requires manually searching for various
data inconsistencies between any two records spread apart in large lists.
We present our design of a learning-based deduplication system that
uses a novel method of interactively discovering challenging training
pairs using {\em active learning}.  Our experiments on real-life
datasets show that active learning significantly reduces the number of
instances needed to achieve high accuracy.  We investigate various
design issues that arise in building a  system to provide
interactive response, fast convergence, and interpretable output.
}
}
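
% Editorial sketch (illustrative only; the paper's committee-based selection is
% replaced here by plain uncertainty sampling, and all names below are
% hypothetical): an interactive loop that asks a human oracle to label the most
% ambiguous record pair at each round.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def active_dedup(pair_features, ask_oracle, seed_idx, n_rounds=20):
    # pair_features: (n_pairs, n_features) similarity features for record pairs
    # ask_oracle(i): returns 1 if pair i is a duplicate, else 0
    # seed_idx: initially labeled pairs; must contain both a duplicate and a non-duplicate
    labeled = list(seed_idx)
    labels = [ask_oracle(i) for i in labeled]
    for _ in range(n_rounds):
        clf = DecisionTreeClassifier(max_depth=5).fit(pair_features[labeled], labels)
        prob = clf.predict_proba(pair_features)[:, 1]
        unlabeled = np.setdiff1d(np.arange(len(pair_features)), labeled)
        pick = int(unlabeled[np.argmin(np.abs(prob[unlabeled] - 0.5))])  # most uncertain pair
        labeled.append(pick)
        labels.append(ask_oracle(pick))
    return clf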


@INPROCEEDINGS{gsc02,
  AUTHOR = {Shantanu Godbole and Sunita Sarawagi and Soumen Chakrabarti},
  TITLE = {Scaling multi-class Support Vector Machines using inter-class confusion},
  MONTH = {July},
  YEAR = 2002,
  BOOKTITLE = {Poster paper in the Proc. of 
the Eighth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining(KDD-2002)},
  ADDRESS = {Edmonton, Canada},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdd02poster.pdf},
  ABSTRACT = {Support vector machines (SVMs) excel at two-class discriminative
learning problems. They often outperform generative
classifiers, especially those that use inaccurate generative
models, such as the naive Bayes (NB) classifier. On the
other hand, generative classifiers have no trouble in handling
an arbitrary number of classes efficiently, and NB classifiers
train much faster than SVMs owing to their extreme simplicity.
In contrast, SVMs handle multi-class problems by
learning redundant yes/no (one-vs-others) classifiers for each
class, further worsening the performance gap. We propose
a new technique for multi-way classification which exploits
the accuracy of SVMs and the speed of NB classifiers. We
first use a NB classifier to quickly compute a confusion matrix,
which is used to reduce the number and complexity of
the two-class SVMs that are built in the second stage. During
testing, we first get the prediction of a NB classifier and
use that to selectively apply only a subset of the two-class
SVMs. On standard benchmarks, our algorithm is 3 to 6
times faster than SVMs and yet matches or even exceeds
their accuracy.
}
}


@INPROCEEDINGS{sbkm02,
  AUTHOR = {Sunita Sarawagi and Anuradha Bhamidipaty and Alok Kirpal and  Chandra Mouli},
  TITLE = {ALIAS: An Active Learning led Interactive  Deduplication System},
  BOOKTITLE = {{Proc. of the 28th Int'l Conference on Very Large Databases (VLDB) (Demonstration session)}},
  ADDRESS = {Hong Kong},
  MONTH = {August},
  YEAR = 2002,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb02demo.pdf},
  ABSTRACT = {Deduplication, a key operation in integrating data from multiple
sources, is a time-consuming, labor-intensive and domain-specific
operation.  We present our design of ALIAS that uses a novel
approach to ease this task by limiting the manual effort to inputing
simple, domain-specific attribute similarity functions and
interactively labeling a small number of record pairs.  We describe
how active learning is useful in selecting informative examples of
duplicates and non-duplicates that can be used to train a
deduplication function.  ALIAS provides mechanism for efficiently
applying the function on large lists of records using a novel
cluster-based execution model.
}
}


@INPROCEEDINGS{ajsh02,
  AUTHOR = {B. Anuradha and Anand Janakiraman and Sunita Sarawagi and Jayant Haritsa},
  TITLE = {Building Classifiers With Unrepresentative Training Instances:
        Experiences From The KDD Cup 2001 Competition},
  BOOKTITLE = {Proc. of Workshop on Data Mining Lessons Learnt  held in conjunction with
the International Conference on Machine Learning, Sydney},
  MONTH = {July},
  YEAR = 2002,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/dmll02.pdf},
  ABSTRACT = {In this paper we discuss our experiences in participating in the KDD
Cup 2001 competition. The task involved classifying organic molecules
as either active or inactive in their binding to a receptor.  The
classification task presented three challenges: highly skewed class
distribution, large number of features exceeding training set size by
two orders of magnitude, and non-representative training instances.
Of these, we found the third challenge the most interesting and novel.
We present our process of experimenting with a number of
classification methods before finally converging on an ensemble of
decision trees constructed using a novel attribute partitioning
method.  Decision trees provided partial shield from the differences
in data distribution and the ensemble provided stability by exploiting
the redundancy in the large set of features.  Finally, we employed
semi-supervised learning to incorporate characteristics of the test
set into the classification model.
We were the second runners-up in the competition.
We followed up the competition with further research in
semi-supervised learning and obtained an accuracy higher than that of
the winning entry.
}
}


@INPROCEEDINGS{cns02,
  AUTHOR = {Surajit Chaudhuri and Vivek Narasayya and Sunita Sarawagi},
  TITLE = {Efficient Evaluation of Queries with Mining Predicates},
  BOOKTITLE = {Proc. of the 18th IEEE Int'l Conference on Data Engineering (ICDE)},
  YEAR = 2002,
  ADDRESS = {San Jose, USA},
  MONTH = {April},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icde02.ps},
  ABSTRACT = {        
Modern relational database systems are beginning to support ad hoc
queries on mining models.  In this paper, we explore novel techniques
for optimizing queries that apply mining models to relational data.
For such queries, we use the internal structure of the mining model to
automatically derive traditional database predicates.  We present
algorithms for deriving such predicates for some popular discrete
mining models: decision trees, naive Bayes, and clustering.  Our
experiments on Microsoft SQL Server 2000 demonstrate that these
derived predicates can significantly reduce the cost of evaluating
such queries.  }
}


@MISC{ss02,
  AUTHOR = {Sunita Sarawagi},
  TITLE = {Automation in Information extraction and data integration},
  YEAR = 2002,
  HOWPUBLISHED = {Tutorial at the Int'l Conference on Very Large Databases (VLDB) 2002, Hong Kong.}
}


@INPROCEEDINGS{sara01:relax,
  AUTHOR = {G. Sathe and S. Sarawagi},
  TITLE = {Intelligent Rollups in multidimensional OLAP data},
  BOOKTITLE = {Proc. of the 27th Int'l Conference on Very Large
Databases (VLDB)},
  ADDRESS = {Rome, Italy},
  YEAR = 2001,
  PAGES = {307-316},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb01.pdf},
  ABSTRACT = {
In this paper we propose a new operator for advanced exploration of
large multidimensional databases.  The proposed operator can
automatically generalize from a specific problem case in detailed data
and return the broadest context in which the problem occurs.
Such functionality would be useful to an analyst who, after observing
a problem case, say a drop in sales for a product in a store, would
like to find the exact scope of the problem.
With existing tools he would have to manually search around the
problem tuple trying to draw a pattern.  This process is both tedious
and imprecise.
Our proposed operator can automate these manual steps and return in a
single step a compact and easy-to-interpret summary of all possible
maximal generalizations along various roll-up paths around the case.
We present a flexible cost-based framework that can generalize various kinds of behaviour (not simply drops) while requiring little additional
customization from the user.  We design an algorithm that can work
efficiently on large multidimensional hierarchical data cubes so as to be usable in an interactive setting.
}
}


@ARTICLE{sigkdd00,
  AUTHOR = {S. Sarawagi and S. Nagaralu},
  TITLE = {Data mining as an internet service},
  JOURNAL = {SIGKDD Explorations},
  YEAR = 2000,
  VOLUME = 2,
  NUMBER = 1,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigkdd.pdf},
  ABSTRACT = {        
The goal of this article is to raise a debate on the usefulness of
providing data mining models as services on the internet.  These
services can be provided by anyone with adequate data and expertise
and made available on the intac for anyone to use.  For instance,
Yahoo or Altavista, given their huge categorized document collection,
can train a document classifier and provide the model as a service on
the intac.  This way data mining can be made accessible to a wider
audience instead of being limited to people with the data and the
expertise.  A host of practical problems need to be solved before this
idea can be made to work.  We identify them and close with an
invitation for further debate and investigation.
  }
}


@INPROCEEDINGS{hmm01,
  AUTHOR = {Vinayak R. Borkar and Kaustubh Deshmukh and Sunita Sarawagi},
  TITLE = {Automatic text segmentation for extracting structured records},
  ADDRESS = {Santa Barbara, USA},
  BOOKTITLE = {Proc. ACM SIGMOD International Conf. on Management of Data},
  YEAR = 2001,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigmod01.pdf},
  ABSTRACT = { 
  In this paper we present a method for automatically segmenting
unformatted text records into structured elements.  Several useful
data sources today are human-generated as continuous text whereas
convenient usage requires the data to be organized as structured
records.  A prime motivation is the warehouse address cleaning problem
of transforming dirty addresses stored in large corporate databases as
a single text field into subfields like ``City'' and ``Street''.
Existing tools rely on hand-tuned, domain-specific rule-based systems.

We describe a tool DATAMOLD that learns to automatically extract
structure when seeded with a small number of training examples.  The
tool builds on Hidden Markov Models (HMMs) to create a powerful
probabilistic model that corroborates multiple sources of information,
including the sequence of
elements, their length distribution, distinguishing words from the
vocabulary, and an optional external data dictionary.  Experiments on
real-life datasets yielded accuracy of 90\% on Asian addresses and
99\% on US addresses.  In contrast, existing information extraction
methods based on rule-learning techniques yielded considerably lower
accuracy.
  }
}


@INPROCEEDINGS{sara00:maxent,
  AUTHOR = {S. Sarawagi},
  TITLE = {User-adaptive exploration of multidimensional data},
  BOOKTITLE = {Proc. of the 26th Int'l Conference on Very Large
Databases (VLDB)},
  ADDRESS = {Cairo, Egypt},
  YEAR = 2000,
  PAGES = {307-316},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb00.pdf},
  ABSTRACT = {        
In this paper we present a tool for enhanced exploration of OLAP data
that is adaptive to a user's prior knowledge of the data.  The tool
continuously keeps track of the parts of the cube that a user has
visited.
The information in these scattered visited parts of the cube is pieced
together to form a model of the user's expected values in the
unvisited parts.  The mathematical foundation for this modeling is
provided by the classical Maximum Entropy principle. At any time, the
user can query for the most surprising unvisited parts of the cube.
The most surprising values are defined as those which if known to the
user would bring the new expected values closest to the actual values. 
This process of updating the user's context based on visited parts and
querying for regions to explore further continues in a loop until the
user's mental model perfectly matches the actual cube.  We believe, and
demonstrate through experiments, that such a user-in-the-loop exploration
will enable much faster assimilation of all significant information in
the data compared to existing manual explorations.
  }
}


@ARTICLE{sara01:maxent,
  AUTHOR = {S. Sarawagi},
  TITLE = {User-cognizant multidimensional analysis},
  JOURNAL = {The VLDB Journal},
  YEAR = 2001,
  VOLUME = 10,
  NUMBER = {2-3},
  PAGES = {224-239},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldbjournal.ps},
  ABSTRACT = {        
Our goal is to enhance multidimensional database systems with a suite
of advanced operators to automate data analysis tasks that are
currently handled through manual exploration.  In this paper we
present a key component of our  system that characterizes the
information content of a cell based on a user's prior familiarity with
the cube and provides a context-sensitive exploration of the cube.
This component has three main modules: a Tracker, which
continuously tracks the parts of the cube that a user has visited; a
Modeler, which pieces together the information in the
visited parts to model the user's expected values in the unvisited
parts; and an Informer, which processes the user's queries about the most
informative unvisited parts of the cube.
The mathematical basis for the expected value modeling is provided by
the classical Maximum Entropy principle.  Accordingly, the expected
values are computed so as to agree with every value that is already
visited while reducing assumptions about unvisited values to the
minimum by maximizing their entropy.  The most informative values are
defined as those that bring the new expected values closest to the
actual values.  We believe, and demonstrate through experiments, that such a
user-in-the-loop exploration will enable much faster assimilation of
all significant information in the data compared to existing manual
explorations.
  }
}


@INPROCEEDINGS{sara00:mda,
  AUTHOR = {S. Sarawagi and G. Sathe},
  TITLE = {{$\bf{i^3}$: Intelligent, Interactive Investigation of OLAP data cubes}},
  BOOKTITLE = {{Proc. ACM SIGMOD International Conf. on Management of Data (Demonstration section)}},
  ADDRESS = {{Dallas, USA}},
  MONTH = {May},
  NOTE = {\url{http://www.it.iitb.ac.in/~sunita/icube}},
  YEAR = 2000,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigmod00.pdf},
  ABSTRACT = {        
The goal of the $i^3$ (eye cube) project is to enhance
multidimensional database products with a suite of advanced operators
to automate data analysis tasks that are currently handled through
manual exploration.  Most OLAP products are rather simplistic and rely
heavily on the user's intuition to manually drive the discovery
process.  Such ad hoc user-driven exploration gets tedious and
error-prone as data dimensionality and size increase.  We first
investigated how and why analysts currently explore the data cube and
then automated these explorations using advanced operators that can be invoked
interactively like existing simple operators.
  }
}


@INPROCEEDINGS{sara99:diff,
  AUTHOR = {S. Sarawagi},
  TITLE = {Explaining differences in multidimensional aggregates},
  BOOKTITLE = {Proc. of the 25th Int'l Conference on Very Large
Databases (VLDB)},
  ADDRESS = {Scotland, UK},
  YEAR = 1999,
  PAGES = {42-53},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb99.pdf},
  ABSTRACT = {        
Our goal is to enhance multidimensional database systems with
advanced mining primitives.  Current Online Analytical Processing
(OLAP) products provide a minimal set of basic aggregate operators
like sum and average and a set of basic navigational operators like
drill-downs and roll-ups. These operators have to be driven entirely
by the analyst's intuition. Such ad hoc exploration gets tedious and
error-prone as data dimensionality and size increase. In earlier work
we presented one such advanced primitive where we pre-mined OLAP data
for exceptions, summarized the exceptions at appropriate levels, and
used them to lead the analyst to the interesting regions.

In this paper we present a second enhancement: a single operator that
lets the analyst get summarized reasons for drops or increases
observed at an aggregated level.  This eliminates the need to manually
drill-down for such reasons. We develop an information-theoretic formulation for
expressing the reasons that is compact and easy to interpret. We
design a dynamic programming algorithm that requires only one pass of
the data improving significantly over our initial greedy algorithm
that required multiple passes.  In addition, the algorithm uses a
small amount of memory independent of the data size.  This allows easy
integration with existing OLAP products. We illustrate with our
prototype on the DB2/UDB ROLAP product with the Excel Pivot-table
frontend.  Experiments on this prototype using the OLAP data benchmark
demonstrate (1) scalability of our algorithm as the size and
dimensionality of the cube increases and (2) feasibility of getting
interactive answers even with modest hardware resources.
  }
}


@ARTICLE{sara01:diff,
  AUTHOR = {S. Sarawagi},
  TITLE = {iDiff: Informative Summarization of Differences in Multidimensional Aggregates},
  JOURNAL = {Data Mining and Knowledge Discovery journal},
  YEAR = 2001,
  VOLUME = 4,
  NUMBER = 5,
  PAGES = {255-276},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/dmkd01.ps},
  ABSTRACT = {        
Multidimensional OLAP products provide an excellent opportunity for
integrating mining functionality because of their widespread
acceptance as a decision support tool and their existing heavy
reliance on manual, user-driven analysis.  Most OLAP products are
rather simplistic and rely heavily on the user's intuition to manually
drive the discovery process.  Such ad hoc user-driven exploration gets
tedious and error-prone as data dimensionality and size increase.
Our goal is to automate these manual discovery processes.  In this
paper we present an example of such automation through an iDiff
operator that in a single step returns summarized reasons for drops or
increases observed at an aggregated level.  

We formulate this as a problem of summarizing the difference between
two multidimensional arrays of real numbers.  We develop a general
framework for such summarization and propose a specific formulation
for the case of OLAP aggregates.
We design an efficient dynamic programming algorithm that requires
only one pass of the data and uses a small amount of memory
independent of the data size.  This allows easy integration with
existing OLAP products. Our prototype has been tested on the Microsoft
OLAP server, DB2/UDB and Oracle 8i.  Experiments using the OLAP
benchmark demonstrate (1) scalability of our algorithm as the size and
dimensionality of the cube increases and (2) feasibility of getting
interactive answers  with modest hardware resources.
  }
}


@INPROCEEDINGS{csd98:time,
  AUTHOR = {S. Chakrabarti and S. Sarawagi and B. Dom},
  TITLE = {Mining surprising temporal patterns},
  BOOKTITLE = {Proc. of the 24th Int'l Conference on Very Large Databases (VLDB)},
  ADDRESS = {New York, USA},
  YEAR = {1998},
  MONTH = {Aug},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb98.ps.gz},
  ABSTRACT = {        
We propose a new notion of {\em surprising} temporal patterns in market
basket data, and algorithms to find such patterns.  This is distinct
from finding {\em frequent} patterns as addressed in the common mining
literature.  We argue that once the analyst is already familiar with
prevalent patterns in the data, the greatest incremental benefit is
likely to be from \emph{changes in the relationship} between item
frequencies over time.

A simple measure of surprise is the extent of departure from a model,
estimated using standard multivariate time series analysis.
Unfortunately, such estimation involves models, smoothing windows and
parameters whose optimal choices can vary dramatically from one
application to another.  In contrast, we propose a precise
characterization of surprise based on the {\em number of bits} in which a
basket sequence can be encoded under a carefully chosen coding scheme.
In this scheme it is inexpensive to encode sequences of itemsets that
have steady, hence likely to be well-known, correlation between items.
Conversely, a sequence with large code length hints at a possibly
surprising correlation.

Our notion of surprise also has the desirable property that the score
of a set of items is offset by anything surprising that the user may
already know from the marginal distribution of any proper subset.  No
parameters, such as support, confidence, or smoothing windows, need to
be estimated or specified by the user.

We experimented with real-life market basket data.  The algorithm
successfully rejected a large number of frequent sets of items that
bore obvious and steady complementary relations to each other, such as
cereal and milk.  Instead, our algorithm found itemsets that showed
statistically strong fluctuations in correlation over time.  These
items had no obviously complementary roles.
  }
}


@INPROCEEDINGS{sam98:exp,
  AUTHOR = {Sunita Sarawagi and Rakesh Agrawal and Nimrod Megiddo},
  TITLE = {Discovery-driven exploration of {OLAP} data cubes},
  BOOKTITLE = {Proc.  of the 6th Int'l Conference on Extending Database
		  Technology (EDBT)},
  YEAR = 1998,
  ADDRESS = {Valencia, Spain},
  MONTH = {March},
  PAGES = {168-182},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/edbt98_ex.ps.gz},
  ABSTRACT = {        
A dominant application of OLAP data cubes is the interactive
exploration of business data by an analyst to find regions of anomaly.
Analysts often have to go through multiple drill-downs and selection
stages before finding anything interesting.  We propose a new
discovery-driven exploration paradigm that pre-mines the data for such
exceptions, summarizes the exceptions at appropriate levels, and leads
the analyst to the interesting regions.  We discuss the statistical
basis of our approach and describe computation techniques that make
this methodology feasible for large data bases.
  }
}


@INPROCEEDINGS{aadgnrs96:cube,
  AUTHOR = {S. Agarwal and R. Agrawal and P.M. Deshpande
                 and A. Gupta and J.F. Naughton and R. Ramakrishnan
		 and S. Sarawagi},
  TITLE = {On the Computation of Multidimensional Aggregates},
  BOOKTITLE = {Proc. of the 22nd Int'l Conference on Very Large Databases},
  ADDRESS = {Mumbai (Bombay), India},
  MONTH = {September},
  YEAR = 1996,
  PAGES = {506--521},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/cube.ps.gz},
  ABSTRACT = {        
On-Line Analytical Processing (OLAP) applications often require
computation of multiple related group-bys. This paper presents fast
algorithms for computing a collection of group-bys. We focus first on
a special case of the aggregation problem---computation of the cube operator.
The cube operator requires computing group-bys on all possible combinations of a
list of attributes. Our algorithms extend hash-based and
sort-based grouping methods with several optimizations, like combining
common operations across multiple group-bys, caching, and using
pre-computed group-bys for computing other group-bys.  Empirical
evaluation using real-life OLAP data shows that the resulting
algorithms yield a factor of two to eight improvement over
straightforward methods and have running times very close to estimated
lower bounds.

We then extend our algorithms to compute a subset of a cube in which
the required set of aggregates does not include all combinations of
attributes.  Finally, we extend our algorithms to handle the common
OLAP case in which attributes are grouped along hierarchies defined on
them.
  }
}


@INCOLLECTION{aadgnrs96:cubeBook,
  AUTHOR = {S. Agarwal and R. Agrawal and P.M. Deshpande
                 and A. Gupta and J.F. Naughton and R. Ramakrishnan
		 and S. Sarawagi},
  TITLE = {On the Computation of Multidimensional Aggregates},
  BOOKTITLE = {Materialized views},
  ADDRESS = {Mumbai (Bombay), India},
  MONTH = {September},
  YEAR = 1996,
  PAGES = {506--521}
}


@TECHREPORT{sag96:cube,
  AUTHOR = {Sunita Sarawagi and Rakesh Agrawal and Ashish Gupta},
  TITLE = {On Computing the Data Cube},
  INSTITUTION = {IBM Almaden Research Center, San Jose, California},
  TYPE = {Research Report},
  NUMBER = {RJ 10026},
  YEAR = 1996,
  NOTE = {Available from {\tt http://www.almaden.ibm.com/cs/quest}},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/cube_rj.ps.gz}
}


@TECHREPORT{ags96:model,
  AUTHOR = {Rakesh Agrawal and Ashish Gupta and Sunita Sarawagi},
  TITLE = {Modeling Multidimensional Databases},
  INSTITUTION = {IBM Almaden Research Center, San Jose, California},
  TYPE = {Research Report},
  YEAR = 1996,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/md_model_rj.pdf},
  ABSTRACT = {        
Multidimensional databases have recently gained widespread acceptance
in the commercial world for supporting on-line analytical processing
(OLAP) applications.  We propose a hypercube-based data model and a
few basic algebraic operations that provide semantic foundation to
multidimensional databases and extend their current functionality.
The operators are composable, reorderable, and closed in application.
They make possible declarative specification and optimization of
multidimensional database queries that are currently specified
operationally.  The operators can be translated to SQL and can be
implemented on top of either a relational database system or a special
purpose multidimensional database engine.  In effect, they provide an
algebraic application programming interface (API) that allows the
separation of the frontend from the backend.  This formalization
provides a framework in which to study multidimensional databases and
opens several new research problems.
  }
}


@INPROCEEDINGS{ags97:model,
  AUTHOR = {Rakesh Agrawal and Ashish Gupta and Sunita Sarawagi},
  TITLE = {Modeling Multidimensional Databases},
  BOOKTITLE = {Proc. of the 13th Int'l Conference on Data Engineering (ICDE)},
  YEAR = 1997,
  ADDRESS = {Birmingham, U.K.},
  MONTH = {April},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/de97_olap.ps.gz}
}


@INPROCEEDINGS{sta98:dbi,
  AUTHOR = {Sunita Sarawagi and Shiby Thomas and Rakesh Agrawal},
  TITLE = {Integrating association rule mining with databases: alternatives and implications},
  BOOKTITLE = {Proc. ACM SIGMOD International Conf. on Management of Data ({\bf Best paper award})},
  ADDRESS = {Seattle, USA},
  MONTH = {June},
  YEAR = 1998,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/sigmod98_dbi.ps.gz},
  ABSTRACT = {        
Data mining on large data warehouses is becoming increasingly
important.  In support of this trend, we consider a spectrum of
architectural alternatives for coupling mining with database systems.
These alternatives include: loose-coupling through a SQL cursor
interface; encapsulation of a mining algorithm in a stored procedure;
caching the data to a file system on-the-fly and mining;
tight-coupling using primarily user-defined functions; and SQL
implementations for processing in the DBMS.  We comprehensively study
the option of expressing the mining algorithm in the form of SQL
queries using Association rule mining as a case in point.  We consider
four options in SQL-92 and six options in SQL enhanced with
object-relational extensions (SQL-OR).  Our evaluation of the different
architectural alternatives shows that from a performance perspective,
the Cache-Mine option is superior, although the performance of the
SQL-OR option is within a factor of two.  Both the Cache-Mine and the
SQL-OR approaches incur a higher storage penalty than the
loose-coupling approach which performance-wise is a factor of 3 to 4
worse than Cache-Mine.  The SQL-92 implementations were too slow to
qualify as a competitive option. We also compare these alternatives on
the basis of qualitative factors like automatic parallelization,
development ease, portability and inter-operability.
  }
}


@ARTICLE{sta00:dbi,
  AUTHOR = {Sunita Sarawagi and Shiby Thomas and Rakesh Agrawal},
  TITLE = {Integrating association rule mining with databases: alternatives and implications},
  JOURNAL = {Data Mining and Knowledge Discovery journal},
  YEAR = 2000,
  VOLUME = 4,
  NUMBER = {2-3},
  PAGES = {89-125},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/dmkd00.pdf},
  ABSTRACT = {        
Data mining on large data warehouses is becoming increasingly important.
In support of this trend, we consider a spectrum of architectural alternatives for
coupling mining with database systems.
These alternatives include: loose-coupling 
through a SQL cursor interface; encapsulation
of a mining algorithm in a stored procedure; caching the
data to a file system on-the-fly and mining;
tight-coupling using primarily user-defined functions; 
and SQL implementations for processing in the DBMS.
We comprehensively study  the option of
expressing the mining algorithm in the form of SQL queries using
Association rule mining  as a case in point.
We consider four options in SQL-92 and six options in
SQL enhanced with object-relational extensions (SQL-OR).
Our evaluation of the different architectural alternatives shows that
from a performance perspective, the Cache-Mine option is superior,
although the performance of the
SQL-OR option is within a factor of two.  Both the
Cache-Mine and the SQL-OR approaches incur a higher storage penalty than
the loose-coupling approach which performance-wise is a factor of 3 to 4
worse than Cache-Mine.  The SQL-92 implementations were too slow
to qualify as a competitive option. We also compare these
alternatives on the basis of qualitative factors like automatic
parallelization, development ease, portability and inter-operability.
  }
}


@INPROCEEDINGS{ts98:dbi,
  AUTHOR = {S. Thomas and S. Sarawagi},
  TITLE = {Mining generalized association rules and sequential patterns 
	using SQL queries},
  BOOKTITLE = {Poster at the 4th International Conference on Knowledge 
Discovery and Data Mining (KDD-98)},
  YEAR = 1998,
  ADDRESS = {New York, USA},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/kdd98_dbi.ps.gz},
  ABSTRACT = {       
Database integration of mining is becoming increasingly important with
the installation of larger and larger data warehouses built around
relational database technology. Most of the commercially available
mining systems integrate loosely (typically, through an ODBC or SQL
cursor interface) with data stored in DBMSs.  In cases where the
mining algorithm makes multiple passes over the data, it is also
possible to cache the data in flat files rather than retrieve multiple
times from the DBMS, to achieve better performance.  Recent studies
have found that for association rule mining, with carefully tuned SQL
formulations it is possible to achieve performance comparable to
systems that cache the data in files outside the DBMS. The SQL
implementation has potential for offering other qualitative advantages
like automatic parallelization, development ease, portability and
inter-operability with relational operators.  In this paper, we
present several alternatives for formulating as SQL queries the mining
of association rules generalized to handle items with hierarchies on
them, as well as sequential patterns. This work illustrates that it is
possible to express, in SQL, computations that are significantly more
complicated than simple Boolean associations using essentially
the same framework.
  }
}


@ARTICLE{sar97:ind,
  AUTHOR = {Sunita Sarawagi},
  TITLE = {Indexing {OLAP} data},
  JOURNAL = {IEEE Data Engineering Bulletin (invited)},
  YEAR = 1997,
  MONTH = {Mar},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/de.ps}
}


@ARTICLE{sar97:tert,
  AUTHOR = {Sunita Sarawagi},
  TITLE = {Execution reordering for tertiary memory access},
  JOURNAL = {IEEE Data Engineering Bulletin (invited)},
  YEAR = 1997,
  MONTH = {Sept}
}


@INPROCEEDINGS{ss96:tert,
  AUTHOR = {Sunita Sarawagi and Mike Stonebraker},
  TITLE = {Reordering execution in tertiary memory databases},
  BOOKTITLE = {Proc. of the 22nd Int'l Conference on Very Large Databases (VLDB)},
  ADDRESS = {Mumbai (Bombay), India},
  MONTH = {September},
  YEAR = 1996,
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/reord.ps.gz},
  ABSTRACT = {        
In the relational data model the order of fetching data does not
affect the correctness of query semantics.  This flexibility has been
exploited in query optimization by statically reordering data
accesses.  However, once a query is optimized, it is processed in a
fixed order in most systems, with the result that data requests are
made in a fixed order.  Only limited forms of runtime reordering can
be provided by low-level device managers that schedule I/O requests
from multiple users, or by asynchronous batch prefetch requests.  More
aggressive reordering strategies are essential in scenarios where the
latency of access to data objects varies widely and dynamically, as in
tertiary devices.  This paper presents such a strategy.  Our key
innovation is to exploit dynamic reordering to match execution order
to the optimal data fetch order, in all parts of the plan-tree.  To
demonstrate the practicality of our approach and the impact of our
optimizations, we report on a prototype implementation based on
Postgres.  Using our system, typical I/O cost for queries on tertiary
memory databases is as much as an order of magnitude smaller than on
conventional systems.
  }
}


@INPROCEEDINGS{ss95:tert,
  AUTHOR = {Sarawagi, S.},
  TITLE = {Query processing and caching in tertiary memory databases},
  BOOKTITLE = {Proc. of the 21st Int'l Conference on Very Large Databases (VLDB)},
  YEAR = {1995},
  ADDRESS = {Zurich, Switzerland},
  MONTH = {Sep},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/vldb95.ps.gz},
  ABSTRACT = {        
   With the rapid increase in the number of applications that require access
to large amounts of data, it is becoming increasingly important for
database systems to handle tertiary storage devices.  The
characteristics of tertiary memory devices are very different from
secondary storage devices that conventional database systems are
designed for.  This requires new approaches to managing data location
and movement, together with query execution in a unified framework.
In this paper we present methods of scheduling queries, caching and
controlling the order of data retrieval for efficient operation in a
tertiary memory environment.  We show how careful interspersing of
queries and informed cache management can achieve remarkable
reductions in access time compared to conventional methods. Our
algorithms use a few model parameters for each tertiary memory device
and are thus designed to be portable across a wide variety of tertiary
memory devices and database types.  We are extending the Postgres
database system to implement the new query processing
strategies. Initial measurements on the prototype yield impressive
results.
  }
}


@INPROCEEDINGS{mss95:tert,
  AUTHOR = {Sarawagi, S.},
  TITLE = {Database systems for efficient access to tertiary memory},
  BOOKTITLE = {Proc. IEEE Mass Storage Symposium (MSS)},
  YEAR = {1995},
  MONTH = {Sep},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/mss.ps.gz}
}


@INPROCEEDINGS{ss94:array,
  AUTHOR = {Sarawagi, S. and Stonebraker, M.},
  TITLE = {Efficient organization of large multidimensional arrays},
  BOOKTITLE = {Proc. of the 10th Int'l Conference on Data Engineering (ICDE)},
  ADDRESS = {Houston, Texas},
  YEAR = {1994},
  DOCUMENTURL = {http://www.it.iitb.ac.in/~sunita/papers/icde.ps.gz},
  ABSTRACT = {        
  Large multidimensional arrays are widely used in scientific and
engineering database applications.  In this paper, we present methods
of organizing arrays to make their access on secondary and tertiary
memory devices fast and efficient.  We have developed four techniques
for doing this: (1) storing the array in multidimensional ``chunks''
to minimize the number of blocks fetched, (2) reordering the chunked
array to minimize seek distance between accessed blocks, (3)
maintaining redundant copies of the array, each organized for a
different chunk size and ordering and (4) partitioning the array onto
platters of a tertiary memory device so as to minimize the number of
platter switches.  Our measurements on real data obtained from global
change scientists show that accesses on arrays organized using these
techniques are often an order of magnitude faster than on the
unoptimized data.
}
}


@TECHREPORT{sara94:tert,
  AUTHOR = {Sarawagi, S. and Stonebraker, M.},
  TITLE = {Single Query Optimization in Tertiary Memory},
  INSTITUTION = {University of California at Berkeley},
  NUMBER = {Sequoia 2000, S2k-94-45},
  YEAR = 1994
}


@MISC{ks03,
  AUTHOR = {Alok Kirpal and Sunita Sarawagi},
  TITLE = {Optimizing the evaluation of complex similarity predicates},
  HOWPUBLISHED = {Submitted for publication},
  YEAR = 2003
}


@MISC{gupta09techreport,
  AUTHOR = {Rahul Gupta and Sunita Sarawagi and Ajit A. Diwan},
  TITLE = {Generalized Collective Inference with Symmetric Clique Potentials},
  YEAR = {2009},
  URL = {http://arxiv.org/abs/0907.0589v2}
}

