@inproceedings{bo2025user-simulation,
author={Bo Ni and Leyao Wang and Yu Wang and Branislav Kveton and Franck Dernoncourt and Yu Xia and Hongjie Chen and Reuben Leura and Samyadeep Basu and Subhojyoti Mukherjee and Puneet Mathur and Nesreen Ahmed and Junda Wu and Li Li and Huixin Zhang and Ruiyi Zhang and Tong Yu and Sungchul Kim and Jiuxiang Gu and Zhengzhong Tu and Alexa Siu and Zichao Wang and David Seunghyun Yoon and Nedim Lipka and Namyong Park and Zihao Lin and Trung Bui and Yue Zhao and Tyler Derr and Ryan A. Rossi},
title={Large Language Models for Conversational User Simulation: A Comprehensive Survey},
booktitle={hal-05217179 preprint},
year={2025},
pages={},
acceptance={},
preprint={https://hal.science/hal-05217179v1/document},
pdf={pubs/LLM-based_User_Simulated_Data_Generation_Survey.pdf},
supp={https://hal.science/hal-05217179},
doi={},
slides={},
location={},
date={},
pubtype={journal},
abstract={User simulation has long played a vital role in computer science due to its potential to support a wide range of applications. Language, as the primary medium of human communication, forms the foundation of social interaction and behavior. Consequently, simulating conversational behavior has become a key area of study. Recent advancements in large language models (LLMs) have significantly catalyzed progress in this domain by enabling high-fidelity generation of synthetic user conversation. In this paper, we survey recent advancements in LLM-based conversational user simulation. We introduce a novel taxonomy covering user granularity and simulation objectives. Additionally, we systematically analyze core techniques and evaluation methodologies. We aim to keep the research community informed of the latest advancements in conversational user simulation and to further facilitate future research by identifying open challenges and organizing existing work under a unified framework.},
keywords={user simulation, user conversational simulation},
active={true},
}
@inproceedings{gallegos2023bias,
author={Gallegos, Isabel O. and Rossi, Ryan A. and Barrow, Joe and Tanjim, Md Mehrab and Kim, Sungchul and Dernoncourt, Franck and Yu, Tong and Zhang, Ruiyi and Ahmed, Nesreen K.},
title={Bias and Fairness in Large Language Models: A Survey},
booktitle={arXiv preprint arXiv:2309.00770},
year={2023},
pages={},
acceptance={},
preprint={https://arxiv.org/pdf/2309.00770.pdf},
pdf={pubs/bias-and-fairness-in-LLMs-a-survey.pdf},
supp={},
doi={},
slides={},
code={https://github.com/i-gallegos/Fair-LLM-Benchmark},
location={},
date={},
pubtype={journal},
abstract={Rapid advancements of large language models (LLMs) have enabled the processing, understanding, and generation of human-like text, with increasing integration into systems that touch our social sphere. Despite this success, these models can learn, perpetuate, and amplify harmful social biases. In this paper, we present a comprehensive survey of bias evaluation and mitigation techniques for LLMs. We first consolidate, formalize, and expand notions of social bias and fairness in natural language processing, defining distinct facets of harm and introducing several desiderata to operationalize fairness for LLMs. We then unify the literature by proposing three intuitive taxonomies, two for bias evaluation, namely metrics and datasets, and one for mitigation. Our first taxonomy of metrics for bias evaluation disambiguates the relationship between metrics and evaluation datasets, and organizes metrics by the different levels at which they operate in a model: embeddings, probabilities, and generated text. Our second taxonomy of datasets for bias evaluation categorizes datasets by their structure as counterfactual inputs or prompts, and identifies the targeted harms and social groups; we also release a consolidation of publicly-available datasets for improved access. Our third taxonomy of techniques for bias mitigation classifies methods by their intervention during pre-processing, in-training, intra-processing, and post-processing, with granular subcategories that elucidate research trends. Finally, we identify open problems and challenges for future work. Synthesizing a wide range of recent research, we aim to provide a clear guide of the existing literature that empowers researchers and practitioners to better understand and prevent the propagation of bias in LLMs.},
keywords={Bias, fairness, large language models, bias mitigation, LLMs, generative models},
active={true},
}
@inproceedings{goel-neurips22,
title={CyCLIP: Cyclic Contrastive Language-Image Pretraining},
author={Shashank Goel and Hritik Bansal and Sumit Bhatia and Ryan A. Rossi and Vishwa Vinay and Aditya Grover},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
pages={},
acceptance={},
year={2022},
preprint={https://arxiv.org/abs/2205.14459},
pdf={pubs/CyCLIP-neurips22.pdf},
supp={},
code={https://github.com/goel-shashank/CyCLIP},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={Recent advances in contrastive representation learning over paired image-text data have led to models such as CLIP that achieve state-of-the-art performance for zero-shot classification and distributional robustness. Such models typically require joint reasoning in the image and text representation spaces for downstream inference tasks. Contrary to prior beliefs, we demonstrate that the image and text representations learned via a standard contrastive objective are not interchangeable and can lead to inconsistent downstream predictions. To mitigate this issue, we formalize consistency and propose CyCLIP, a framework for contrastive representation learning that explicitly optimizes for the learned representations to be geometrically consistent in the image and text space. In particular, we show that consistent representations can be learned by explicitly symmetrizing (a) the similarity between the two mismatched image-text pairs (cross-modal consistency); and (b) the similarity between the image-image pair and the text-text pair (in-modal consistency). Empirically, we show that the improved consistency in CyCLIP translates to significant gains over CLIP, with gains ranging from 10%-24% for zero-shot classification accuracy on standard benchmarks (CIFAR-10, CIFAR-100, ImageNet1K) and 10%-27% for robustness to various natural distribution shifts.},
keywords={CLIP, contrastive representation learning, text-to-image generative model},
active={true},
}
@inproceedings{zhou2022neural,
title={Neural Point Process for Learning Spatiotemporal Event Dynamics},
author={Zihao Zhou and Xingyi Yang and Ryan Rossi and Handong Zhao and Rose Yu},
booktitle={Learning for Dynamics and Control Conference (L4DC)},
pages={777--789},
organization={PMLR},
year={2022},
preprint={},
pdf={pubs/zhou22a.pdf},
supp={},
code={https://github.com/Rose-STL-Lab/DeepSTPP},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={Learning the dynamics of spatiotemporal events is a fundamental problem. Neural point processes enhance the expressivity of point process models with deep neural networks. However, most existing methods only consider temporal dynamics without spatial modeling. We propose Deep Spatiotemporal Point Process (DeepSTPP), a deep dynamics model that integrates spatiotemporal point processes. Our method is flexible, efficient, and can accurately forecast irregularly sampled events over space and time. The key construction of our approach is the nonparametric space-time intensity function, governed by a latent process. The intensity function enjoys closed-form integration for the density. The latent process captures the uncertainty of the event sequence. We use amortized variational inference to infer the latent process with deep networks. Using synthetic datasets, we validate our model can accurately learn the true intensity function. On real-world benchmark datasets, our model demonstrates superior performance over state-of-the-art baselines.},
keywords={neural point processes, Deep Spatiotemporal Point Process},
active={true},
}
@inproceedings{zhao2021automatic,
title={Automatic Unsupervised Outlier Model Selection},
author={Yue Zhao and Ryan A. Rossi and Leman Akoglu},
booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
volume={34},
pages={4489--4502},
year={2021},
preprint={},
pdf={pubs/NeurIPS-2021-automatic-unsupervised-outlier-model-selection.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={Given an unsupervised outlier detection task on a new dataset, how can we automatically select a good outlier detection algorithm and its hyperparameter(s) (collectively called a model)? In this work, we tackle the unsupervised outlier model selection (UOMS) problem, and propose MetaOD, a principled, data-driven approach to UOMS based on meta-learning. The UOMS problem is notoriously challenging, as compared to model selection for classification and clustering, since (i) model evaluation is infeasible due to the lack of hold-out data with labels, and (ii) model comparison is infeasible due to the lack of a universal objective function. MetaOD capitalizes on the performances of a large body of detection models on historical outlier detection benchmark datasets, and carries over this prior experience to automatically select an effective model to be employed on a new dataset without any labels, model evaluations or model comparisons. To capture task similarity within our meta-learning framework, we introduce specialized meta-features that quantify outlying characteristics of a dataset. Extensive experiments show that selecting a model by MetaOD significantly outperforms no model selection (e.g. always using the same popular model or the ensemble of many) as well as other meta-learning techniques that we tailored for UOMS. Moreover upon (meta-)training, MetaOD is extremely efficient at test time; selecting from a large pool of 300+ models takes less than 1 second for a new task. We open-source MetaOD and our meta-learning database for practical use and to foster further research on the UOMS problem.},
keywords={},
active={true},
}
@article{qian2022personalized,
title={Personalized Visualization Recommendation},
author={Xin Qian and Ryan A. Rossi and Fan Du and Sungchul Kim and Eunyee Koh and Sana Malik and Tak Yeon Lee and Nesreen K. Ahmed},
journal={ACM Transactions on the Web (TWEB)},
volume={16},
number={3},
pages={1--47},
year={2022},
publisher={ACM New York, NY},
preprint={},
pdf={pubs/Personalized-Visualization-Recommendation-TWEB22.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={journal},
abstract={Visualization recommendation work has focused solely on scoring visualizations based on the underlying dataset and not the actual user and their past visualization feedback. These systems recommend the same visualizations for every user, despite that the underlying user interests, intent, and visualization preferences are likely to be fundamentally different, yet vitally important. In this work, we formally introduce the problem of personalized visualization recommendation and present a generic learning framework for solving it. In particular, we focus on recommending visualizations personalized for each individual user based on their past visualization interactions (e.g., viewed, clicked, manually created) along with the data from those visualizations. More importantly, the framework can learn from visualizations relevant to other users, even if the visualizations are generated from completely different datasets. Experiments demonstrate the effectiveness of the approach as it leads to higher quality visualization recommendations tailored to the specific user intent and preferences. To support research on this new problem, we release our user-centric visualization corpus consisting of 17.4k users exploring 94k datasets with 2.3 million attributes and 32k user-generated visualizations.},
keywords={},
active={true},
}
@article{chen2022graph,
title={Graph Deep Factors for Probabilistic Time-series Forecasting},
author={Hongjie Chen and Ryan A. Rossi and Kanak Mahadik and Sungchul Kim and Hoda Eldardiry},
journal={ACM Transactions on Knowledge Discovery from Data (TKDD)},
year={2022},
pdf={pubs/TKDD-GraphDF.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={journal},
abstract={},
keywords = {Incremental Online Learning, Time-series Forecasting, Graph Neural Network},
active={true},
}
@inproceedings{verma2022robustness,
title={Robustness of Fusion-based Multimodal Classifiers to Cross-Modal Content Dilutions},
author={Gaurav Verma and Vishwa Vinay and Ryan A. Rossi and Srijan Kumar},
booktitle={Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2022},
preprint={https://arxiv.org/abs/2211.02646},
pdf={pubs/EMNLP22-Robustness-Multimodal.pdf},
supp={},
doi={},
slides={},
code={https://claws-lab.github.io/multimodal-robustness/},
location={},
date={},
pubtype={conference},
abstract={As multimodal learning finds applications in a wide variety of high-stakes societal tasks, investigating their robustness becomes important. Existing work has focused on understanding the robustness of vision-and-language models to imperceptible variations on benchmark tasks. In this work, we investigate the robustness of multimodal classifiers to cross-modal dilutions - a plausible variation. We develop a model that, given a multimodal (image + text) input, generates additional dilution text that (a) maintains relevance and topical coherence with the image and existing text, and (b) when added to the original text, leads to misclassification of the multimodal input. Via experiments on Crisis Humanitarianism and Sentiment Detection tasks, we find that the performance of task-specific fusion-based multimodal classifiers drops by 23.3% and 22.5%, respectively, in the presence of dilutions generated by our model. Metric-based comparisons with several baselines and human evaluations indicate that our dilutions show higher relevance and topical coherence, while simultaneously being more effective at demonstrating the brittleness of the multimodal classifiers. Our work aims to highlight and encourage further research on the robustness of deep multimodal models to realistic variations, especially in human-facing societal applications.},
keywords={},
active={true},
}
@inproceedings{icml22-OnlineNDPPs,
title={One-Pass Algorithms for MAP Inference of Nonsymmetric Determinantal Point Processes},
author={Aravind Reddy and Ryan A. Rossi and Zhao Song and Anup Rao and Tung Mai and Nedim Lipka and Gang Wu and Eunyee Koh and Nesreen Ahmed},
booktitle={Proceedings of the 39th International Conference on Machine Learning (ICML)},
pages={18463--18482},
year={2022},
preprint={},
pdf={pubs/ICML22-OnlineNDPP.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={In this paper, we initiate the study of one-pass algorithms for solving the maximum-a-posteriori (MAP) inference problem for Non-symmetric Determinantal Point Processes (NDPPs). In particular, we formulate streaming and online versions of the problem and provide one-pass algorithms for solving these problems. In our streaming setting, data points arrive in an arbitrary order and the algorithms are constrained to use a single-pass over the data as well as sub-linear memory, and only need to output a valid solution at the end of the stream. Our online setting has an additional requirement of maintaining a valid solution at any point in time. We design new one-pass algorithms for these problems and show that they perform comparably to (or even better than) the offline greedy algorithm while using substantially lower memory.},
keywords={Non-symmetric Determinantal Point Processes, NDPPs, One-pass Algorithms},
active={true},
}
@inproceedings{park2022cgc,
title={CGC: Contrastive Graph Clustering for Community Detection and Tracking},
author={Namyong Park and Ryan Rossi and Eunyee Koh and Iftikhar Ahamath Burhanuddin and Sungchul Kim and Fan Du and Nesreen Ahmed and Christos Faloutsos},
booktitle={Proceedings of the ACM Web Conference (WWW)},
pages={1115--1126},
year={2022},
preprint={},
pdf={pubs/Contrastive-Graph-Clustering-WWW22.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{ullah2021machine,
title={Machine Unlearning via Algorithmic Stability},
author={Enayat Ullah and Tung Mai and Anup Rao and Ryan A. Rossi and Raman Arora},
booktitle={Conference on Learning Theory (COLT)},
pages={4126--4142},
organization={PMLR},
year={2021},
preprint={},
pdf={http://proceedings.mlr.press/v134/ullah21a/ullah21a.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{abdallah2022autoforecast,
title={AutoForecast: Automatic Time-Series Forecasting Model Selection},
author={Mustafa Abdallah and Ryan Rossi and Kanak Mahadik and Sungchul Kim and Handong Zhao and Saurabh Bagchi},
booktitle={Proceedings of the 31st ACM International Conference on Information \& Knowledge Management (CIKM)},
pages={5--14},
year={2022},
preprint={},
pdf={pubs/AutoForecast-CIKM22.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{hoang2022automars,
title={AutoMARS: Searching to Compress Multi-Modality Recommendation Systems},
author={Duc Hoang and Haotao Wang and Handong Zhao and Ryan Rossi and Sungchul Kim and Kanak Mahadik and Zhangyang Wang},
booktitle={Proceedings of the 31st ACM International Conference on Information \& Knowledge Management (CIKM)},
pages={727--736},
year={2022},
pdf={pubs/AutoMARS-CIKM22.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{yuan2022clustering,
title={Clustering-based Unsupervised Generative Relation Extraction},
author={Chenhan Yuan and Ryan Rossi and Andrew Katz and Hoda Eldardiry},
booktitle={IEEE International Conference on Big Data (IEEE BigData)},
year={2022},
pdf={pubs/CURE-BigData22.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{xiao2022imarker,
title={iMarker: Instant and True-to-scale AR with Invisible Markers},
author={Chang Xiao and Ryan Rossi and Eunyee Koh},
booktitle={Proceedings of the 35th Annual ACM Symposium on User Interface Software and Technology (UIST)},
year={2022},
preprint={},
pdf={pubs/iMarker-UIST22.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{jiang2022compression,
title={Task-Oriented Near-Lossless Burst Compression},
author={Weixin Jiang and Gang Wu and Vishy Swaminathan and Stefano Petrangeli and Haoliang Wang and Ryan A. Rossi and Nedim Lipka},
booktitle={IEEE International Symposium on Multimedia (ISM)},
year={2022},
pdf={pubs/Task_Oriented_Near_Lossless_Burst_Image_Compression.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{rezayi2021framework-old,
title={A Framework for Knowledge-Derived Query Suggestions},
author={Saed Rezayi and Nedim Lipka and Vishwa Vinay and Ryan A. Rossi and Franck Dernoncourt and Tracy H. King and Sheng Li},
booktitle={IEEE International Conference on Big Data (IEEE BigData)},
pages={510--518},
year={2021},
pdf={pubs/A_Framework_for_Knowledge_Derived_Query_Suggestion.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={false},
}
@inproceedings{weld2022adjusting,
title={Adjusting for Confounders with Text: Challenges and an Empirical Evaluation Framework for Causal Inference},
author={Galen Weld and Peter West and Maria Glenski and David Arbour and Ryan A. Rossi and Tim Althoff},
booktitle={Proceedings of the International AAAI Conference on Web and Social Media (ICWSM)},
volume={16},
pages={1109--1120},
year={2022},
pdf={pubs/ICWSM22.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{chen2021context,
title={Context Integrated Relational Spatio-Temporal Resource Forecasting},
author={Hongjie Chen and Ryan A. Rossi and Kanak Mahadik and Hoda Eldardiry},
booktitle={IEEE International Conference on Big Data (IEEE BigData)},
pages={1359--1368},
year={2021},
pdf={pubs/CIGNN.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{jin2022generalizing,
title={On Generalizing Static Node Embedding to Dynamic Settings},
author={Di Jin and Sungchul Kim and Ryan A. Rossi and Danai Koutra},
booktitle={Proceedings of the Fifteenth ACM International Conference on Web Search and Data Mining (WSDM)},
pages={410--420},
year={2022},
pdf={pubs/WSDM22.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{qin2022external,
title={External Knowledge Infusion for Tabular Pre-training Models with Dual-adapters},
author={Can Qin and Sungchul Kim and Handong Zhao and Tong Yu and Ryan A. Rossi and Yun Fu},
booktitle={Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
pages={1401--1409},
year={2022},
preprint={},
pdf={pubs/Tabular_Pretraining_KDD22.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{oh2022implicit,
title={Implicit Session Contexts for Next-Item Recommendations},
author={Sejoon Oh and Ankur Bhardwaj and Jongseok Han and Sungchul Kim and Ryan A. Rossi and Srijan Kumar},
booktitle={Proceedings of the 31st ACM International Conference on Information \& Knowledge Management (CIKM)},
pages={4364--4368},
year={2022},
pdf={pubs/CIKM22-Implicit-Session-Contexts-for-Next-Item-Recommendations.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{mehta2021open,
title={Open-Domain Trending Hashtag Recommendation for Videos},
author={Mehta, Swapneel and Sarkhel, Somdeb and Chen, Xiang and Mitra, Saayan and Swaminathan, Viswanathan and Rossi, Ryan and Aminian, Ali and Guo, Han and Garg, Kshitiz},
booktitle={IEEE International Symposium on Multimedia (ISM)},
pages={174--181},
year={2021},
pdf={https://doi.org/10.1109/ISM52913.2021.00035},
preprint={},
supp={},
doi={10.1109/ISM52913.2021.00035},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{sarma2022evaluating,
title={Evaluating the Use of Uncertainty Visualisations for Imputations of Data Missing At Random in Scatterplots},
author={Abhraneel Sarma and Shunan Guo and Jane Hoffswell and Ryan Rossi and Fan Du and Eunyee Koh and Matthew Kay},
booktitle={IEEE VIS},
year={2022},
pdf={pubs/VIS22-Uncertainty-Visualizations.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{xu2022arshopping,
title={ARShopping: In-Store Shopping Decision Support Through Augmented Reality and Immersive Visualization},
author={Bingjie Xu and Shunan Guo and Eunyee Koh and Jane Hoffswell and Ryan Rossi and Fan Du},
booktitle={IEEE VIS},
year={2022},
preprint={},
pdf={https://arxiv.org/pdf/2207.07643.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{vis2022personal,
title={Let's Get Personal: Exploring the Design of Personalized Visualizations},
author={Beleicia Bullock and Shunan Guo and Eunyee Koh and Ryan Rossi and Fan Du and Jane Hoffswell},
booktitle={IEEE VIS},
year={2022},
pdf={pubs/2022-PersonalizedVisualization-VIS.pdf},
preprint={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract = {},
keywords = {},
active={true},
}
@inproceedings{WWW22-VisGNN,
author = {Fayokemi Ojo and Ryan A. Rossi and Jane Hoffswell and Shunan Guo and Fan Du and Sungchul Kim and Chang Xiao and Eunyee Koh},
title = {VisGNN: Personalized Visualization Recommendation via Graph Neural Networks},
year = {2022},
booktitle = {Proceedings of the ACM Web Conference (WWW)},
pages = {2810--2818},
preprint={},
pdf={pubs/WWW22-VisGNN.pdf},
supp={},
doi = {10.1145/3485447.3512001},
slides={},
code={},
location = {Virtual Event, Lyon, France},
date={},
pubtype={conference},
abstract = {In this work, we develop a Graph Neural Network (GNN) framework for the problem of personalized visualization recommendation. The GNN-based framework first represents the large corpus of datasets and visualizations from users as a large heterogeneous graph. Then, it decomposes a visualization into its data and visual components, and then jointly models each of them as a large graph to obtain embeddings of the users, attributes (across all datasets in the corpus), and visual-configurations. From these user-specific embeddings of the attributes and visual-configurations, we can predict the probability of any visualization arising from a specific user. Finally, the experiments demonstrated the effectiveness of using graph neural networks for automatic and personalized recommendation of visualizations to specific users based on their data and visual (design choice) preferences. To the best of our knowledge, this is the first such work to develop and leverage GNNs for this problem.},
keywords = {User Modeling, Graph Neural Networks, Personalized Visualization Recommendation, Attribute Recommendation, Personalization},
active={true},
}
@inproceedings{2022-cicero-responsive-grammar,
title={Cicero: A Declarative Grammar for Responsive Visualization},
author={Hyeok Kim and Ryan Rossi and Fan Du and Eunyee Koh and Shunan Guo and Jessica Hullman and Jane Hoffswell},
booktitle={ACM Human Factors in Computing Systems (CHI)},
year={2022},
preprint={},
pdf={pubs/CiceroResponsiveGrammar-CHI22.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{2022-codas-ba-report-authoring,
title = {CODAS: Integrating Business Analytics and Report Authoring},
author = {Zhuohao Zhang and Sana Malik and Shunan Guo and Jane Hoffswell and Ryan Rossi and Fan Du and Eunyee Koh},
booktitle = {EuroVis Workshop on Visual Analytics (EuroVA)},
year = {2022},
preprint={},
pdf={},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{zhang2022understanding,
title={Understanding Business Analysts' Needs for Data Report Authoring},
author={Zhuohao Zhang and Sana Malik and Shunan Guo and Jane Hoffswell and Ryan A. Rossi and Fan Du and Eunyee Koh},
year={2022},
booktitle={EuroVis Workshop on Visual Analytics (EuroVA)},
publisher={The Eurographics Association},
preprint={},
pdf={pubs/ReportAuthoring-EuroVA22.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{oh2021influence,
title={Influence-guided Data Augmentation for Neural Tensor Completion},
author={Sejoon Oh and Sungchul Kim and Ryan A. Rossi and Srijan Kumar},
booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management (CIKM)},
pages={1386--1395},
year={2021},
preprint={},
pdf={pubs/dain-oh-cikm2021.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{rossi2021closing,
title={From Closing Triangles to Higher-Order Motif Closures for Better Unsupervised Online Link Prediction},
author={Ryan A. Rossi and Anup Rao and Sungchul Kim and Eunyee Koh and Nesreen K. Ahmed and Gang Wu},
booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management (CIKM)},
pages={4085--4093},
year={2021},
preprint={},
pdf={pubs/CIKM21-Higher-Order-Motif-Closures.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{rezayi2021framework,
title={A Framework for Knowledge-Derived Query Suggestions},
author={Saed Rezayi and Nedim Lipka and Vishwa Vinay and Ryan A. Rossi and Franck Dernoncourt and Tracy H. King and Sheng Li},
booktitle={IEEE International Conference on Big Data (IEEE BigData)},
pages={510--518},
year={2021},
preprint={},
pdf={pubs/Knowledge_Derived_Query_Suggestion_IEEE_BigData21.pdf},
supp={},
code={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={},
keywords={},
active={true},
}
@inproceedings{kim2021automated,
title={An Automated Approach to Reasoning about Task-oriented Insights in Responsive Visualization},
author={Hyeok Kim and Ryan Rossi and Abhraneel Sarma and Dominik Moritz and Jessica Hullman},
booktitle={IEEE VIS},
pages={129--139},
year={2021},
preprint={https://arxiv.org/abs/2107.08141},
pdf={pubs/VIS21.pdf},
supp={https://medium.com/multiple-views-visualization-research-explained/how-can-we-tell-when-a-responsive-visualization-retains-the-message-of-a-source-view-fc30e70449dd},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={Authors often transform a large screen visualization for smaller displays through rescaling, aggregation and other techniques when creating visualizations for both desktop and mobile devices (i.e., responsive visualization). However, transformations can alter relationships or patterns implied by the large screen view, requiring authors to reason carefully about what information to preserve while adjusting their design for the smaller display. We propose an automated approach to approximating the loss of support for task-oriented visualization insights (identification, comparison, and trend) in responsive transformation of a source visualization. We operationalize identification, comparison, and trend loss as objective functions calculated by comparing properties of the rendered source visualization to each realized target (small screen) visualization. To evaluate the utility of our approach, we train machine learning models on human ranked small screen alternative visualizations across a set of source visualizations. We find that our approach achieves an accuracy of 84% (random forest model) in ranking visualizations. We demonstrate this approach in a prototype responsive visualization recommender that enumerates responsive transformations using Answer Set Programming and evaluates the preservation of task-oriented insights using our loss measures. We discuss implications of our approach for the development of automated and semi-automated responsive visualization recommendation.},
keywords={},
active={true},
}
@inproceedings{park-icml20,
title={Structured Policy Iteration for Linear Quadratic Regulator},
author={Youngsuk Park and Ryan A. Rossi and Zheng Wen and Gang Wu and Handong Zhao},
booktitle={International Conference on Machine Learning (ICML)},
pages={},
acceptance={},
year={2020},
preprint={},
pdf={pubs/park-icml20.pdf},
supp={},
doi={},
slides={},
location={Vienna, Austria},
date={},
pubtype={conference},
abstract={Linear quadratic regulator (LQR) is one of the most popular frameworks to tackle continuous Markov decision process tasks. With its fundamental theory and tractable optimal policy, LQR has been revisited and analyzed in recent years, in terms of reinforcement learning scenarios such as the model-free or model-based setting. In this paper, we introduce the Structured Policy Iteration (S-PI) for LQR, a method capable of deriving a structured linear policy. Such a structured policy with (block) sparsity or low-rank can have significant advantages over the standard LQR policy: more interpretable, memory-efficient, and well-suited for the distributed setting. In order to derive such a policy, we first cast a regularized LQR problem when the model is known. Then, our Structured Policy Iteration (S-PI) algorithm, which takes a policy evaluation step and a policy improvement step in an iterative manner, can solve this regularized LQR efficiently. We further extend the S-PI algorithm to the model-free setting where a smoothing procedure is adopted to estimate the gradient. In both the known-model and model-free setting, we prove convergence analysis under the proper choice of parameters. Finally, the experiments demonstrate the advantages of S-PI in terms of balancing the LQR performance and level of structure by varying the weight parameter.},
keywords={Linear Quadratic Regulator, structured policy, regularized LQR, block sparsity, low-rank policy, reinforcement learning},
active={true},
}
@inproceedings{zheng2022network,
title={Network Report: A Structured Description for Network Datasets},
author={Xinyi Zheng and Ryan A. Rossi and Nesreen K. Ahmed and Dominik Moritz},
booktitle={CIKM},
pages={3694--3704},
year={2022},
preprint={https://arxiv.org/abs/2206.03635},
pdf={pubs/NetworkReport-CIKM22.pdf},
supp={},
doi={},
slides={},
code={},
location={},
date={},
pubtype={conference},
abstract={The rapid development of network science and technologies depends on shareable datasets. Currently, there is no standard practice for reporting and sharing network datasets. Some network dataset providers only share links, while others provide some contexts or basic statistics. As a result, critical information may be unintentionally dropped, and network dataset consumers may misunderstand or overlook critical aspects. Inappropriately using a network dataset can lead to severe consequences (e.g., discrimination) especially when machine learning models on networks are deployed in high-stake domains. Challenges arise as networks are often used across different domains (e.g., network science, physics, etc) and have complex structures. To facilitate the communication between network dataset providers and consumers, we propose network report. A network report is a structured description that summarizes and contextualizes a network dataset. Network report extends the idea of dataset reports (e.g., Datasheets for Datasets) from prior work with network-specific descriptions of the non-i.i.d. nature, demographic information, network characteristics, etc. We hope network reports encourage transparency and accountability in network research and development across different fields.},
keywords={},
active={true},
}
@inproceedings{CINES,
author={Nesreen Ahmed and Richard Alo and Catherine Amelink and Young Yun Baek and Aashish Chudhary and Kristy Collins and Albert Esterline and Edward Fox and Geoffrey Fox and Aric Hagberg and Ron Kenyon and Chris J. Kuhlman and Jure Leskovec and Dustin Machi and Madhav V. Marathe and Natarajan Meghanathan and Yasuo Miyasaki and Judy Qiu and Naren Ramakrishnan and S. S. Ravi and Ryan Rossi and Rok Sosic and Gregor von Laszewski},
title={net.science: A Cyberinfrastructure for Sustained Innovation in Network Science and Engineering},
booktitle={Gateways},
year={2020},
pages={},
acceptance={},
preprint={},
pdf={pubs/CINES-Gateway20.pdf},
supp={},
doi={},
slides={},
location={},
date={},
pubtype={},
abstract={Networks have entered the mainstream lexicon over
the last ten years. This coincides with the pervasive use of networks
in a host of disciplines of interest to industry and academia,
including biology, neurology, genomics, psychology, social sciences,
economics, psychology, and cyber-physical systems and
infrastructure. Several dozen journals and conferences regularly
contain articles related to networks. Yet, there are no general-purpose
cyberinfrastructures (CI) that can be used across these
varied disciplines and domains. Furthermore, while there are
scientific gateways that include some network science capabilities
for particular domains (e.g., biochemistry, genetics), there are no
general-purpose network-based scientific gateways. In this work,
we introduce net.science, a CI for Network Engineering and
Science, that is designed to be a community resource. This paper
provides an overview of net.science, addressing key requirements
and concepts, CI components, the types of applications that
our CI will support, and various dimensions of our evaluation
process.},
keywords={Cyberinfrastructure, network science, net.science, CINES},
active={true},
}
@inproceedings{rossi20tkdd-roles,
author={Ryan A. Rossi and Di Jin and Sungchul Kim and Nesreen K. Ahmed and Danai Koutra and John Boaz Lee},
title={On Proximity and Structural Role-based Embeddings in Networks: Misconceptions, Techniques, and Applications},
booktitle={Transactions on Knowledge Discovery from Data (TKDD)},
year={2020},
pages={36},
acceptance={},
preprint={},
pdf={pubs/rossi20-tkdd.pdf},
supp={},
doi={},
slides={},
location={},
date={August 17, 2019},
pubtype={journal},
abstract={Structural roles define sets of structurally similar nodes that are more similar to nodes inside the set than outside, whereas communities define sets of nodes with more connections inside the set than outside. Roles based on structural similarity and communities based on proximity are fundamentally different but important complementary notions. Recently, the notion of structural roles has become increasingly important and has gained a lot of attention due to the proliferation of work on learning representations (node/edge embeddings) from graphs that preserve the notion of roles. Unfortunately, recent work has sometimes confused the notion of structural roles and communities (based on proximity) leading to misleading or incorrect claims about the capabilities of network embedding methods. As such, this paper seeks to clarify the misconceptions and key differences between structural roles and communities, and formalize the general mechanisms (e.g., random walks, feature diffusion) that give rise to community or role-based structural embeddings. We theoretically prove that embedding methods based on these mechanisms result in either community or role-based structural embeddings. These mechanisms are typically easy to identify and can help researchers quickly determine whether a method preserves community or role-based embeddings. Furthermore, they also serve as a basis for developing new and improved methods for community or role-based structural embeddings. Finally, we analyze and discuss applications and data characteristics where community or role-based embeddings are most appropriate.},
keywords={Role-based structural embedding, roles, structural similarity,
community-based embedding,
proximity,
role discovery, positions,
structural embeddings,
structural node embeddings,
proximity-based node embeddings,
node embeddings,
network representation learning,
graphlets},
active={true},
}
@inproceedings{qu2021vis,
author={Zening Qu and Fan Du and Ryan A. Rossi and Bill Howe},
title={Aunt Lily Can Say Her Visualizations: Directing Analysis, Design, and Storytelling in Natural Language},
booktitle={NL VIZ: Workshop on Exploring Opportunities and Challenges for Natural Language Techniques to Support Visual Analysis},
year={2021},
pages={},
acceptance={},
preprint={},
pdf={pubs/qu2021vis.pdf},
supp={},
doi={},
slides={},
date={},
pubtype={workshop},
abstract={We envision Lily, a mixed-initiative collaborative authoring platform for anyone to direct, in their own words, data fusing, analysis, visualization design, and storytelling, end-to-end. We foresee one lightweight user interface that flexibly accepts novice or expert language, concise or elaborate, declarative or imperative -- that encodes coarse- or fine-grain controls over the outcome. Lily's platform regulates the sharing and consumption of data tables, insights, story templates, visualizations, and components. Natural language can manipulate data and designs if associated with custom recommended data stories under the same language model. Since templates are strings, what can be expressed by popular authoring tools can be re-expressed by Lily.},
keywords={text to visualization, visualization, NLP, NL4Vis},
active={true},
}
@inproceedings{jmlr20,
title={Ensemble Learning for Relational Data},
author={Hoda Eldardiry and Jennifer Neville and Ryan A. Rossi},
booktitle={Journal of Machine Learning Research (JMLR)},
year={2020},
pdf={http://jmlr.org/papers/volume21/14-368/14-368.pdf},
preprint={},
supp={http://jmlr.org/papers/v21/14-368.html},
pubtype={journal},
abstract={In this work, we present a theoretical analysis framework for relational ensemble models. We show that ensembles of collective classifiers can improve predictions for graph data by reducing errors due to variance in both learning \emph{and} inference. In addition, we propose a relational ensemble framework that combines a relational ensemble learning approach with a relational ensemble inference approach for collective classification. This combination allows the relational ensemble to reduce errors due to variance in both learning and inference. The proposed ensemble techniques are applicable for both single and multiple graph settings. Experiments on both synthetic and real-world data demonstrate the effectiveness of the proposed framework. Finally, our experimental results support the theoretical analysis and confirm that ensemble algorithms that explicitly focus on both learning and inference processes and aim at reducing errors associated with both, are the best performers.},
keywords={Ensemble learning, relational ensemble, collective classification, collective inference, bias-variance decomposition, relational machine learning},
active={true},
}
@inproceedings{rossi-heterogeneous-graphlets-tkdd,
author={Ryan A. Rossi and Nesreen K. Ahmed and Aldo Carranza and David Arbour and Anup Rao and Sungchul Kim and Eunyee Koh},
title={Heterogeneous Graphlets},
booktitle={Transactions on Knowledge Discovery from Data (TKDD)},
year={2020},
pages={43},
pdf={pubs/heterogeneous-graphlets-TKDD20.pdf},
preprint={},
supp={},
slidesOld={slides/KDD19-Heterogeneous-Graphlets.pdf},
slideshare={},
posterOld={poster/KDD19-Heterogeneous-Graphlets.pdf},
pubtype={journal},
abstract={In this paper, we introduce a generalization of graphlets to heterogeneous networks called typed graphlets. Informally, typed graphlets are small typed induced subgraphs. Typed graphlets generalize graphlets to rich heterogeneous networks as they explicitly capture the higher-order typed connectivity patterns in such networks. To address this problem, we describe a general framework for counting the occurrences of such typed graphlets. The proposed algorithms leverage a number of combinatorial relationships for different typed graphlets. For each edge, we count a few typed graphlets, and with these counts along with the combinatorial relationships, we obtain the exact counts of the other typed graphlets in o(1) constant time. Notably, the worst-case time complexity of the proposed approach matches the time complexity of the best known untyped algorithm. In addition, the approach lends itself to an efficient lock-free and asynchronous parallel implementation. While there are no existing methods for typed graphlets, there has been some work that focused on computing a different and much simpler notion called colored graphlet. The experiments confirm that our proposed approach is orders of magnitude faster and more space-efficient than methods for computing the simpler notion of colored graphlet. Unlike these methods that take hours on small networks, the proposed approach takes only seconds on large networks with millions of edges. Notably, since typed graphlet is more general than colored graphlet (and untyped graphlets), the counts of various typed graphlets can be combined to obtain the counts of the much simpler notion of colored graphlets. The proposed methods give rise to new opportunities and applications for typed graphlets.},
keywords={Heterogeneous graphlets, colored network motifs, typed motifs, position-aware typed graphlets, typed graphlets, colored graphlets, colored motifs, colored network motifs, network motifs, heterogeneous networks, labeled graphs, signed networks, k-partite graphs},
active={true},
}
@inproceedings{from-comm-to-structural-role-embeddings,
author={Ryan A. Rossi and Di Jin and Sungchul Kim and Nesreen K. Ahmed and Danai Koutra and John Boaz Lee},
title={From Community to Role-based Graph Embeddings},
booktitle={arXiv:1908.08572},
year={2019},
pages={},
acceptance={},
preprint={https://arxiv.org/pdf/1908.08572.pdf},
pdf={pubs/from-comm-to-structural-role-based-embeddings.pdf},
supp={},
doi={},
slides={},
location={},
date={August 17, 2019},
pubtype={},
abstract={Roles are sets of structurally similar nodes that are more similar to nodes inside the set than outside, whereas communities are sets of nodes with more connections inside the set than outside (based on proximity/closeness, density). Roles and communities are fundamentally different but important complementary notions. Recently, the notion of roles has become increasingly important and has gained a lot of attention due to the proliferation of work on learning representations (node/edge embeddings) from graphs that preserve the notion of roles. Unfortunately, recent work has sometimes confused the notion of roles and communities leading to misleading or incorrect claims about the capabilities of network embedding methods. As such, this manuscript seeks to clarify the differences between roles and communities, and formalize the general mechanisms (e.g., random walks, feature diffusion) that give rise to community or role-based embeddings. We show mathematically why embedding methods based on these identified mechanisms are either community or role-based. These mechanisms are typically easy to identify and can help researchers quickly determine whether a method is more prone to learn community or role-based embeddings. Furthermore, they also serve as a basis for developing new and better methods for community or role-based embeddings. Finally, we analyze and discuss the applications and data characteristics where community or role-based embeddings are most appropriate.},
keywords={Role-based embedding, roles, structural similarity, community-based embedding, communities, proximity, role discovery, positions, structural embeddings, structural roles, structural equivalence, node embeddings, network representation learning, feature learning, graphlets},
active={true},
}
@inproceedings{ahmed19-temporal-network-sampling,
author={Nesreen K. Ahmed and Nick Duffield and Ryan A. Rossi},
title={Temporal Network Sampling},
booktitle={arXiv:1910.08657},
year={2019},
pages={},
acceptance={},
pdf={pubs/temporal-network-sampling.pdf},
preprint={https://arxiv.org/pdf/1910.08657.pdf},
supp={},
pubtype={},
abstract={Temporal networks representing a stream of timestamped edges are seemingly ubiquitous in the real-world. However, the massive size and continuous nature of these networks make them fundamentally challenging to analyze and leverage for descriptive and predictive modeling tasks. In this work, we propose a general framework for temporal network sampling with unbiased estimation. We develop online, single-pass sampling algorithms and unbiased estimators for temporal network sampling. The proposed algorithms enable fast, accurate, and memory-efficient statistical estimation of temporal network patterns and properties. In addition, we propose a temporally decaying sampling algorithm with unbiased estimators for studying networks that evolve in continuous time, where the strength of links is a function of time, and the motif patterns are temporally-weighted. In contrast to the prior notion of a $\Delta t$-temporal motif, the proposed formulation and algorithms for counting temporally weighted motifs are useful for forecasting tasks in networks such as predicting future links, or a future time-series variable of nodes and links. Finally, extensive experiments on a variety of temporal networks from different domains demonstrate the effectiveness of the proposed algorithms.},
keywords={Temporal network sampling, online sampling,
unbiased estimation,
temporal patterns,
temporal statistics,
temporal link decay,
temporal weighted motifs,
temporally decayed motif counts,
temporal networks, edge streams, online algorithms, dynamic networks, streaming algorithms},
active={true},
}
@inproceedings{rossi-wsdm20,
author={Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim and Anup Rao and Yasin Abbasi-Yadkori},
title={A Structural Graph Representation Learning Framework},
booktitle={Proceedings of the ACM International Conference on Web Search and Data Mining (WSDM)},
year={2020},
pages={1--9},
pdf={pubs/WSDM20-structural-node-embedding-framework.pdf},
preprint={https://arxiv.org/pdf/1801.09303.pdf},
supp={https://arxiv.org/pdf/1801.09303.pdf},
slides={talks/WSDM20-HONE-talk.pdf},
pubtype={conference},
abstract={The success of many graph-based machine learning tasks highly depends on an appropriate representation learned from the graph data. Most work has focused on learning node embeddings that preserve proximity as opposed to structural role-based embeddings that preserve the structural similarity among nodes. These methods fail to capture higher-order structural dependencies and connectivity patterns that are crucial for structural role-based applications such as visitor stitching from web logs. In this work, we formulate higher-order network representation learning and describe a general framework called HONE for learning such structural node embeddings from networks via the subgraph patterns (network motifs, graphlet orbits/positions) in a node's neighborhood. A general diffusion mechanism is introduced in HONE along with a space-efficient approach that avoids explicit construction of the k-step motif-based matrices using a k-step linear operator. Furthermore, HONE is shown to be fast and efficient with a worst-case time complexity that is nearly-linear in the number of edges. The experiments demonstrate the effectiveness of HONE for a number of important tasks including link prediction and visitor stitching from large web log data.},
keywords={
Structural node embeddings,
role-based embeddings,
structural similarity,
roles,
network motifs,
graphlets,
structural embeddings, higher-order network embeddings,
induced subgraph patterns,
orbits,
higher-order network analysis,
network representation learning,
graph feature learning,
graph representation learning,
node embeddings,
higher-order graph features,
inductive graph features,
attributed graphs,
feature diffusion,
higher-order graph embeddings,
higher-order organization,
higher-order network modules,
higher-order connectivity patterns,
HONE,
deep learning},
active={true},
}
@inproceedings{gromit-www20,
author={Gromit Yeuk-Yin Chan and Fan Du and Ryan A. Rossi and Anup Rao and Eunyee Koh and Cl\'{a}udio T. Silva and Juliana Freire},
title={Real-Time Clustering for Large Sparse Online Visitor Data},
booktitle={Proceedings of The Web Conference (WWW)},
year={2020},
pages={1--11},
pdf={pubs/WWW20-real-time-clustering-for-large-sparse-online-visitor-data.pdf},
supp={},
location={Taipei, Taiwan},
pubtype={conference},
abstract={Online visitor behaviors are often modeled as a large sparse matrix, where rows represent visitors and columns represent behavior. To discover customer segments with different hierarchies, marketers often need to cluster the data in different splits. Such analyses require the clustering algorithm to provide real-time responses on user parameter changes, which the current techniques cannot support. In this paper, we propose a real-time clustering algorithm, sparse density peaks, for large-scale sparse data. It first pre-processes the input points to compute annotations for cluster assignment. While the assignment is only a single scan of the points, a naive pre-processing requires measuring all pairwise distances, which incurs a quadratic computation overhead and is infeasible for any moderately sized data. Thus, we propose a new approach based on MinHash and LSH that provides fast and accurate estimations. We also describe an efficient implementation on Spark that addresses data skew and memory usage. Our experiments show that our approach (1) provides a better approximation compared to a straightforward MinHash and LSH implementation in terms of accuracy on real datasets, (2) achieves a 20 $\times$ speedup in the end-to-end clustering pipeline, and (3) can maintain computations with a small memory. Finally, we present an interface to explore customer segments from millions of online visitor records in real-time.},
keywords={Clustering,
Sparse binary data,
Density peaks,
Sketching,
Spark},
active={true},
}
@inproceedings{typed-graphlet-estimation-www20,
author={Ryan A. Rossi and Anup Rao and Tung Mai and Nesreen K. Ahmed},
title={Fast and Accurate Estimation of Typed Graphlets},
booktitle={Proceedings of The Web Conference (WWW)},
year={2020},
pages={},
pdf={pubs/WWW20-typed-graphlet-estimation.pdf},
supp={},
slides={slides/Rossi_WWW20_Typed_Graphlet_Estimation.pdf},
location={Taipei, Taiwan},
pubtype={conference},
abstract={Typed graphlets are small typed (labeled, colored) induced subgraphs and were recently shown to be the fundamental building blocks of rich complex heterogeneous networks. In many applications, speed is more important than accuracy, and it is sufficient to trade-off a tiny amount of accuracy for a significantly faster method. In this work, we propose fast and accurate estimators for typed graphlets. The typed graphlet estimation techniques naturally support general heterogeneous graphs with any arbitrary number of types, which include bipartite, k-partite, k-star, labeled graphs, and attributed networks as special cases. The experiments demonstrate the effectiveness of the typed graphlet estimation techniques.},
keywords={Estimation methods, typed graphlet count estimation, provable guarantees, typed graphlets, heterogeneous network motifs, heterogeneous graphlets, estimation, fast approximation methods},
active={true},
}
@inproceedings{motif-closures-www20,
author={Ryan A. Rossi and Anup Rao and Sungchul Kim and Eunyee Koh and Nesreen K. Ahmed},
title={From Closing Triangles to Closing Higher-Order Motifs},
booktitle={Proceedings of The Web Conference (WWW)},
year={2020},
pages={},
pdf={pubs/WWW20-motif-closures.pdf},
supp={},
slides={slides/Rossi-WWW20-Motif-Closures.pdf},
location={Taipei, Taiwan},
pubtype={conference},
abstract={This work introduces higher-order ranking and link prediction methods based on closing higher-order network motifs. In particular, we propose the general notion of a motif closure that goes beyond simple triangle closures and demonstrate that these new motif closures often outperform triangle-based methods. This result implies that one should consider other motif closures beyond simple triangles. We also find that the ``best'' motif closure depends highly on the underlying network and its structural properties. Furthermore, the methods are fast and efficient for real-time applications such as online visitor stitching, web search, and recommendation. The experimental results indicate the importance of these new motif closures. Finally, the new motif closures can serve as a basis for developing better (un)supervised ranking/link prediction methods.},
keywords={Motif closure, higher-order motif closure, higher-order ranking, online link prediction, network motifs, graphlets, real-time algorithms},
active={true},
}
@inproceedings{hierarchical-clustering-www20,
author={Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim},
title={Fast Hierarchical Graph Clustering in Linear-Time},
booktitle={Proceedings of The Web Conference (WWW)},
year={2020},
pages={},
pdf={pubs/WWW20-hierarchical-clustering.pdf},
supp={},
slides={slides/Rossi-WWW20-Hierarchical-Graph-Clustering-in-Linear-Time.pdf},
location={Taipei, Taiwan},
pubtype={conference},
abstract={While there has been a lot of research on graph clustering (community detection), most work (i) does not address the hierarchical community detection problem or are (ii) inefficient for large networks. In this work, we describe an approach called hLP that addresses both these limitations. Notably, hLP is fast and efficient for discovering a hierarchy of communities in large networks with a worst-case time and space complexity that is linear in the number of edges and nodes, respectively. The experiments demonstrate the effectiveness of hLP. Finally, we show an application for visualizing large networks with hundreds of thousands of nodes and edges.},
keywords={Hierarchical clustering, hierarchical community detection, linear-time algorithms, fast algorithms},
active={true},
}
@inproceedings{farhadi2020stream-matching,
title={Approximate Maximum Matching in Random Streams},
author={Alireza Farhadi and MohammadTaghi Hajiaghayi and Tung Mai and Anup Rao and Ryan A. Rossi},
booktitle={Proceedings of the Thirty-First Annual ACM-SIAM Symposium on Discrete Algorithms (SODA)},
pages={1--20},
year={2020},
organization={SIAM},
acceptance={},
preprint={pubs/soda20.pdf},
pdf={pubs/soda20.pdf},
supp={},
doi={},
slides={},
location={},
date={},
pubtype={conference},
abstract={In this paper, we study the problem of finding a maximum matching in the semi-streaming model when edges arrive in random order. In the semi-streaming model, an algorithm receives a stream of edges and it is allowed to have a memory of $\tilde{O}(n)$ where $n$ is the number of vertices in the graph. A recent work shows that there exists a streaming algorithm with the approximation ratio of $\frac{2}{3}$ that uses $\tilde{O}(n^{1.5})$ memory. However, their memory is much larger than the memory constraint of the semi-streaming algorithms. In this work, we further investigate this problem in the semi-streaming model, and we give the first better-than-$0.5$ approximation algorithm in the semi-streaming model.
Our main results are as follows.
We show that there exists a single-pass deterministic semi-streaming algorithm that finds a $\frac{3}{5} (= 0.6)$ approximation of the maximum matching in bipartite graphs using $\tilde{O}(n)$ memory. This result outperforms the state-of-the-art result that finds a $0.539$ approximation of the maximum matching using $\tilde{O}(n)$ memory.
By giving a black-box reduction from finding a matching in general graphs to finding a matching in bipartite graphs, we show there exists a single-pass deterministic semi-streaming algorithm that finds a $\frac{6}{11} (\approx 0.545)$ approximation of the maximum matching in general graphs, improving upon the state-of-art result $0.506$ approximation. Our work is the first better-than-$0.5$ approximation algorithm for this problem in the semi-streaming model.},
keywords={Matching, maximum matching, stream matching, streaming, semi-streaming, graph theory, sublinear algorithms, approximation algorithms, property testing},
active={true},
}
@inproceedings{causality-aaai20,
author={Ryan A. Rossi and Somdeb Sarkhel and Nesreen K. Ahmed},
title={Inferring Individual Level Causal Models from Graph-based Relational Time Series},
booktitle={AAAI StarAI},
year={2020},
pages={},
acceptance={},
preprint={},
arxivpdf={https://arxiv.org/pdf/2001.05993.pdf},
pdf={pubs/aaai20-relational-time-series-causal-inference.pdf},
supp={},
doi={},
slides={poster/AAAI20-StarAI-poster.pdf},
poster={poster/AAAI20-StarAI-poster.pdf},
location={},
date={},
pubtype={workshop},
abstract={In this work, we formalize the problem of causal inference over graph-based relational time-series data where each node in the graph has one or more time-series associated to it. We propose causal inference models for this problem that leverage both the graph topology and time-series to accurately estimate local causal effects of nodes. Furthermore, the relational time-series causal inference models are able to estimate local effects for individual nodes by exploiting local node-centric temporal dependencies \emph{and} topological/structural dependencies. We show that simpler causal models that do not consider the graph topology are recovered as special cases of the proposed relational time-series causal inference model. We describe the conditions under which the resulting estimate can be used to estimate a causal effect, and describe how the Durbin-Wu-Hausman test of specification can be used to test for the consistency of the proposed estimator from data. Empirically, we demonstrate the effectiveness of the causal inference models on both synthetic data with known ground-truth and a large-scale observational relational time-series data set collected from Wikipedia.},
keywords={Causal inference, causality, causal inference models, relational time-series causal inference, relational time-series, graph time-series},
active={true},
}
@inproceedings{lee19-motif-attention,
author={John Boaz Lee and Ryan Rossi and Xiangnan Kong and Sungchul Kim and Eunyee Koh and Anup Rao},
title={Graph Convolutional Networks with Motif-based Attention},
booktitle={28th ACM International Conference on Information and Knowledge Management (CIKM)},
year={2019},
pages={},
acceptance={20%},
pdf={pubs/motif-attention-CIKM19.pdf},
preprint={},
supp={},
doi={},
slides={},
location={Beijing, China},
pubtype={conference},
abstract={The success of deep convolutional neural networks in the domains of computer vision and speech recognition has led researchers to investigate generalizations of the said architecture to graph-structured data. A recently-proposed method called Graph Convolutional Networks has been able to achieve state-of-the-art results in the task of node classification. However, since the proposed method relies on localized first-order approximations of spectral graph convolutions, it is unable to capture higher-order interactions between nodes in the graph. In this work, we propose a motif-based graph attention model, called Motif Convolutional Networks, which generalizes past approaches by using weighted multi-hop motif adjacency matrices to capture higher-order neighborhoods. A novel attention mechanism is used to allow each individual node to select the most relevant neighborhood to apply its filter. We evaluate our approach on graphs from different domains (social networks and bioinformatics) with results showing that it is able to outperform a set of competitive baselines on the semi-supervised node classification task. Additional results demonstrate the usefulness of attention, showing that different higher-order neighborhoods are prioritized by different kinds of nodes.},
keywords={graph attention,
motifs,
graph convolution,
reinforcement learning,
higher-order network motifs,
structural role,
deep learning},
active={true},
}
@inproceedings{mai-uai19,
author={Tung Mai and Anup Rao and Matt Kapilevich and Ryan A. Rossi and Yasin Abbasi-Yadkori and Ritwik Sinha},
title={On Densification for Minwise Hashing},
booktitle={UAI},
year={2019},
pages={},
acceptance={26%},
pdf={pubs/UAI19-MinHash-Densification.pdf},
preprint={http://auai.org/uai2019/proceedings/papers/302.pdf},
supp={},
doi={},
slides={},
location={Tel Aviv, Israel},
pubtype={conference},
abstract={One Permutation Hashing (OPH) is a significantly more efficient alternative to the popular minwise hashing. To produce a sketch of size $k$, OPH requires just one hash function whereas the classical minwise hashing requires $k$ hash functions. However, OPH does not have the desirable locality sensitive hashing (LSH) property that is important for indexing. [Srivastava and Li 2014, ICML] proposed the novel idea of densification to produce LSH sketches from OPH sketches, and gave the first densification routine.
In this paper, we give a necessary and sufficient condition for a densification routine to result in LSH sketches when applied to OPH sketches. Furthermore, we give a novel densification routine that for every input, takes $O(k \log k)$ time in expectation and achieves better variance than the previous best bound obtained by [Srivastava 2017, ICML]. The running time of the densification routine given in [Srivastava 2017, ICML] for worst case inputs is $O(k^2)$ in expectation.},
keywords={Sketching, one permutation hashing, densification, locality-sensitive hashing},
active={true},
}
@article{rossi2019complex,
title={Complex networks are structurally distinguishable by domain},
author={Ryan A. Rossi and Nesreen K. Ahmed},
journal={Social Network Analysis and Mining (SNAM)},
volume={9},
number={1},
pages={51},
year={2019},
publisher={Springer},
pdf={pubs/SNAM19-rossi-et-al.pdf},
preprint={},
supp={https://link.springer.com/article/10.1007/s13278-019-0593-7},
doi={https://doi.org/10.1007/s13278-019-0593-7},
slides={},
pubtype={journal},
abstract={Complex networks arise in many domains and often represent phenomena such as brain activity, social relationships, molecular interactions, hyperlinks, and re-tweets. In this work, we study the problem of predicting the category (domain) of arbitrary networks. This includes complex networks from different domains as well as synthetically generated graphs from six different network models. We formulate this problem as a multiclass classification problem and learn a model to predict the domain of a new previously unseen network using only a small set of simple structural features. The model is able to accurately predict the domain of arbitrary networks from 17 different domains with 95.7% accuracy. This work makes two important findings. First, our results indicate that complex networks from various domains have distinct structural properties that allow us to predict with high accuracy the category of a new previously unseen network. Second, synthetic graphs are trivial to classify as the classification model can predict with near-certainty the graph model used to generate it. Overall, the results demonstrate that networks drawn from different domains and graph models are distinguishable using a few simple structural features.},
keywords={Network categorization, network domain classification, structural distinguishability, network domain distinguishability, graph classification,
structural properties, network science, complex networks, network classification, machine learning},
active={true},
}
@article{rossi2019complexb,
title={Complex networks are structurally distinguishable by domain},
author={Ryan A. Rossi and Nesreen K. Ahmed},
journal={Social Network Analysis and Mining (SNAM)},
volume={9},
number={1},
pages={51},
year={2019},
publisher={Springer},
pdf={pubs/SNAM19-rossi-et-al.pdf},
preprint={},
supp={https://link.springer.com/article/10.1007/s13278-019-0593-7},
doi={https://doi.org/10.1007/s13278-019-0593-7},
slides={},
pubtype={journal},
abstract={Complex networks arise in many domains and often represent phenomena such as brain activity, social relationships, molecular interactions, hyperlinks, and re-tweets. In this work, we study the problem of predicting the category (domain) of arbitrary networks. This includes complex networks from different domains as well as synthetically generated graphs from six different network models. We formulate this problem as a multiclass classification problem and learn a model to predict the domain of a new previously unseen network using only a small set of simple structural features. The model is able to accurately predict the domain of arbitrary networks from 17 different domains with 95.7% accuracy. This work makes two important findings. First, our results indicate that complex networks from various domains have distinct structural properties that allow us to predict with high accuracy the category of a new previously unseen network. Second, synthetic graphs are trivial to classify as the classification model can predict with near-certainty the graph model used to generate it. Overall, the results demonstrate that networks drawn from different domains and graph models are distinguishable using a few simple structural features.},
keywords={Network categorization, network domain classification, structural distinguishability, network domain distinguishability, graph classification,
structural properties, network science, complex networks, network classification, machine learning},
active={false},
}
@inproceedings{ctdne-journal,
author={John Boaz Lee and Giang Nguyen and Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim},
title={Temporal Network Representation Learning},
booktitle={arXiv},
year={2019},
pages={},
acceptance={},
pdf={pubs/CTDNE-extended.pdf},
preprint={https://arxiv.org/pdf/1904.06449.pdf},
supp={},
doi={},
slides={},
pubtype={journal},
abstract={Networks evolve continuously over time with the addition, deletion, and changing of links and nodes. Such temporal networks (or edge streams) consist of a sequence of timestamped edges and are seemingly ubiquitous. Despite the importance of accurately modeling the temporal information, most embedding methods ignore it entirely or approximate the temporal network using a sequence of static snapshot graphs. In this work, we introduce the notion of temporal walks for learning dynamic embeddings from temporal networks. Temporal walks capture the temporally valid interactions (e.g., flow of information, spread of disease) in the dynamic network in a lossless fashion. Based on the notion of temporal walks, we describe a general class of embeddings called continuous-time dynamic network embeddings (CTDNEs) that completely avoid the issues and problems that arise when approximating the temporal network as a sequence of static snapshot graphs. Unlike previous work, CTDNEs learn dynamic node embeddings directly from the temporal network at the finest temporal granularity and thus use only temporally valid information. As such CTDNEs naturally support online learning of the node embeddings in a streaming real-time fashion. The experiments demonstrate the effectiveness of this class of embedding methods for prediction in temporal networks.},
keywords={temporal random walks, dynamic network embeddings,
temporal network embeddings,
continuous-time dynamic network embeddings,
temporal network representation learning,
dynamic graph representation learning,
node embeddings,
dynamic networks,
temporal networks,
temporal graphs,
temporal walks,
dynamic walks,
stream walks,
graph streams,
continuous-time dynamic networks},
active={true},
}
@inproceedings{latent-network-summ-kdd19,
author={Di Jin and Ryan A. Rossi and Eunyee Koh and Sungchul Kim and Anup Rao and Danai Koutra},
title={Latent Network Summarization: Bridging Network Embedding and Summarization},
booktitle={KDD},
year={2019},
pages={},
acceptance={},
pdf={pubs/latent-network-summarization-KDD19.pdf},
preprint={https://arxiv.org/abs/1811.04461},
supp={},
pubtype={conference},
abstract={An important reason behind the prevalence of node representation learning is their superiority in downstream machine learning tasks on graphs. However, storing the vector-based node representation of massive real-world graphs often requires space that is orders of magnitude larger. To alleviate this issue, we introduce the problem of latent network summarization that is complementary to the problem of network embedding and propose a general framework called Multi-LENS. Instead of deriving dense node-wise representation, the goal of latent network summarization is to summarize the structural properties of the graph in a latent space with dimensionality that is independent of the nodes or edges in the input graph. The size-independent graph summary given by Multi-LENS consists of (i) a set of relational aggregators with their compositions (relational functions) that captures structural features of multi-order node-centric subgraphs, and (ii) the low-rank approximations to the induced matrices incorporating graph heterogeneity. In addition, Multi-LENS is able to derive node embeddings on the fly from this latent summarization due to its inductive properties. Multi-LENS bridges advantages brought by both embeddings and graph summarization, and applies to graphs with or without directionality, weights, attributes or labels. Extensive experiments on both synthetic and real-world graphs show that Multi-LENS achieves 2-89\% improvement in AUC for link prediction, while requiring less than 79x space compared to existing representation learning approaches. We also show the effectiveness of Multi-LENS summaries in detecting anomalies and events in the Enron email communication graph and Twitter co-mention graph.},
keywords={graph summarization, graph compression, heterogeneous networks, structural similarity, latent network summarization},
active={true},
}
@inproceedings{chen20-fig-caption-generation,
author={Charles Chen and Ruiyi Zhang and Eunyee Koh and Sungchul Kim and Scott Cohen and Ryan A. Rossi},
title={Figure Captioning with Reasoning and Sequence-Level Training},
booktitle={IEEE Winter Conference on Applications of Computer Vision (WACV)},
year={2020},
pages={},
acceptance={},
preprint={pubs/figure-caption-generation-WACV20.pdf},
pdf={pubs/figure-caption-generation-WACV20.pdf},
supp={},
doi={},
slides={},
location={},
date={},
pubtype={},
abstract={Figures, such as line plots, pie charts, bar charts, are widely used to convey important information in a concise format. In this work, we investigate the problem of figure caption generation where the goal is to automatically generate a natural language description for a given figure. While natural image captioning has been studied extensively, figure captioning has received relatively little attention and remains a challenging problem. A successful solution to this task has many potential applications, such as: 1) adding captions to the output of a visualization tool; 2) summarizing documents with a number of figures with or without proper captions; 3) improving user experience by allowing figure content to be accessible to those with visual impairment. To solve this problem, we collect a dataset FigCAP for testing the capability of generating captions, and propose a captioning framework with novel attention models. In order to solve the exposure bias issue, we further train the captioning model with sequence-level policy based on reinforcement learning, which directly optimizes evaluation metrics. Extensive experiments show that our proposed models outperform strong image captioning baselines, thus demonstrating a significant potential for automatic generating captions for figures.},
keywords={Figure caption generation, sequence-level training, caption generation, computer vision, deep learning},
active={true},
}
@inproceedings{rossi-heterogeneous-graphlets,
author={Ryan A. Rossi and Nesreen K. Ahmed and Aldo Carranza and David Arbour and Anup Rao and Sungchul Kim and Eunyee Koh},
title={Heterogeneous Graphlets},
booktitle={MLG KDD},
year={2019},
pages={8},
pdf={pubs/heterogeneous-graphlets.pdf},
preprint={},
supp={},
slides={slides/KDD19-Heterogeneous-Graphlets.pdf},
slideshare={},
poster={poster/KDD19-Heterogeneous-Graphlets.pdf},
pubtype={workshop},
abstract={In this work, we generalize the notion of network motifs (graphlets) to heterogeneous networks by introducing the notion of a small induced typed subgraph called typed graphlet. Typed graphlets generalize graphlets to rich heterogeneous networks as they explicitly capture the higher-order typed connectivity patterns in such networks. To address this problem, we describe a general framework for counting the occurrences of such typed graphlets. The proposed algorithms leverage a number of combinatorial relationships for different typed graphlets. For each edge, we count a few typed graphlets, and with these counts along with the combinatorial relationships, we obtain the exact counts of the other typed graphlets in o(1) constant time. Notably, the worst-case time complexity of the proposed approach matches the best known untyped algorithm. In addition, the approach lends itself to an efficient lock-free and asynchronous parallelization. The experiments confirm the approach is orders of magnitude faster and more space-efficient than existing methods. Unlike existing methods that take hours on small networks, the proposed approach takes only seconds on large networks with millions of edges. This gives rise to new opportunities and applications for typed graphlets on large real-world networks.},
keywords={Heterogeneous graphlets, colored network motifs, typed motifs, heterogeneous graphlets, typed graphlets, colored graphlets, colored motifs, colored network motifs, network motifs, heterogeneous networks, labeled graphs, signed networks, k-partite graphs},
active={true},
}
@inproceedings{node2bits-ECML19,
title={{Node2BITS}: Compact Time- and Attribute-aware Node Representations for User Stitching},
author={Di Jin and Mark Heimann and Ryan A. Rossi and Danai Koutra},
booktitle={ECML/PKDD},
pdf={pubs/node2bits.pdf},
preprint={https://arxiv.org/pdf/1904.08572.pdf},
pages={22},
year={2019},
pubtype={conference},
abstract={Identity stitching, the task of identifying and matching various online references (e.g., sessions over different devices and timespans) to the same user in real-world web services, is crucial for personalization and recommendations. However, traditional user stitching approaches, such as grouping or blocking, require quadratic pairwise comparisons between a massive number of user activities, thus posing both computational and storage challenges. Recent works, which are often application-specific, heuristically seek to reduce the amount of comparisons, but they suffer from low precision and recall. To solve the problem in an application-independent way, we take a heterogeneous network-based approach in which users (nodes) interact with content (e.g., sessions, websites), and may have attributes (e.g., location). We propose node2bits, an efficient framework that represents multi-dimensional features of node contexts with binary hashcodes. node2bits leverages feature-based temporal walks to encapsulate short- and long-term interactions between nodes in heterogeneous web networks, and adopts SimHash to obtain compact, binary representations and avoid the quadratic complexity for similarity search. Extensive experiments on large-scale real networks show that node2bits outperforms traditional techniques and existing works that generate real-valued embeddings by up to 5.16% in F1 score on user stitching, while taking only up to 1.56% as much storage.},
keywords={Hash-based network representation learning, space-efficient embeddings, binary network embeddings, temporal feature-based walks, temporal walks, dynamic network embedding},
active={true},
}
@inproceedings{rossi-TKDE18,
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
title={Deep Inductive Graph Representation Learning},
booktitle={IEEE Transactions on Knowledge and Data Engineering (TKDE)},
year={2018},
pages={14},
pdf={pubs/deepGL.pdf},
preprint={},
supp={},
pubtype={journal},
abstract={This paper presents a general inductive graph representation learning framework called DeepGL for learning deep node and edge features that generalize across-networks. In particular, DeepGL begins by deriving a set of base features from the graph (e.g., graphlet features) and automatically learns a multi-layered hierarchical graph representation where each successive layer leverages the output from the previous layer to learn features of a higher-order. Contrary to previous work, DeepGL learns relational functions (each representing a feature) that naturally generalize across-networks and are therefore useful for graph-based transfer learning tasks. Moreover, DeepGL naturally supports attributed graphs, learns interpretable inductive graph representations, and is space-efficient (by learning sparse feature vectors). In addition, DeepGL is expressive, flexible with many interchangeable components, efficient with a time complexity of $\mathcal{O}(|E|)$, and scalable for large networks via an efficient parallel implementation. Compared with recent methods, DeepGL is (1) effective for across-network transfer learning tasks and large (attributed) graphs, (2) space-efficient requiring up to 6x less memory, (3) fast with up to 106x speedup in runtime performance, and (4) accurate with an average improvement in AUC of 20\% or more on many learning tasks and across a wide variety of networks.},
keywords={Inductive network representation learning,
compositional network embedding,
inductive learning,
network representation learning,
graph feature learning,
graph representation learning,
deep graph features,
relational functions,
relational feature operators,
higher-order features,
transfer learning,
inductive graph features,
attributed graphs,
node/edge features,
node feature learning,
edge feature learning,
relational deep learning,
deep graph learning,
hierarchical graph representation,
feature diffusion,
graphlets,
network motifs,
induced subgraph patterns,
deep learning},
active={true},
}
@inproceedings{socc19-lqr-cloud,
author={Youngsuk Park and Kanak Mahadik and Ryan A. Rossi and Nesreen K. Ahmed and Gang Wu and Handong Zhao},
title={Linear Quadratic Regulator for Resource-Efficient Cloud Services},
booktitle={Proceedings of the ACM Symposium on Cloud Computing (SoCC)},
year={2019},
pages={488--489},
acceptance={},
pdf={pubs/SoCC19-LQR-Cloud-Optimization.pdf},
doi={https://doi.org/10.1145/3357223.3366028},
pubtype={},
abstract={},
keywords={Linear Quadratic Regulator, resource-efficient cloud services, cloud optimization, resource allocation, control},
active={true},
}
@inproceedings{rossi19-hLP,
author={Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim},
title={Linear-time Hierarchical Community Detection},
booktitle={arXiv:1906.06432},
year={2019},
pages={},
acceptance={},
pdf={pubs/rossi19-hLP.pdf},
preprint={https://arxiv.org/pdf/1906.06432.pdf},
supp={https://arxiv.org/abs/1906.06432},
pubtype={},
abstract={Community detection in graphs has many important and fundamental applications including in distributed systems, compression, image segmentation, divide-and-conquer graph algorithms such as nested dissection, document and word clustering, circuit design, among many others. Finding these densely connected regions of graphs remains an important and challenging problem. Most work has focused on scaling up existing methods to handle large graphs. These methods often partition the graph into two or more communities. In this work, we focus on the problem of hierarchical community detection (i.e., finding a hierarchy of dense community structures going from the lowest granularity to the largest) and describe an approach that runs in linear time with respect to the number of edges and thus fast and efficient for large-scale networks. The experiments demonstrate the effectiveness of the approach quantitatively. Finally, we show an application of it for visualizing large networks with hundreds of thousands of nodes/links.},
keywords={Community detection, hierarchical communities, linear-time algorithms, label propagation, graph clustering, graph mining},
active={true},
}
@inproceedings{rossi19-motif-closures,
author={Ryan A. Rossi and Anup Rao and Sungchul Kim and Eunyee Koh and Nesreen K. Ahmed and Gang Wu},
title={Higher-Order Ranking and Link Prediction: From Closing Triangles to Closing Higher-Order Motifs},
booktitle={arXiv:1906.05059},
year={2019},
pages={},
acceptance={},
pdf={pubs/rossi19-motif-closures.pdf},
preprint={https://arxiv.org/pdf/1906.05059.pdf},
supp={https://arxiv.org/abs/1906.05059},
pubtype={},
abstract={In this paper, we introduce the notion of motif closure and describe higher-order ranking and link prediction methods based on the notion of closing higher-order network motifs. The methods are fast and efficient for real-time ranking and link prediction-based applications such as web search, online advertising, and recommendation. In such applications, real-time performance is critical. The proposed methods do not require any explicit training data, nor do they derive an embedding from the graph data, or perform any explicit learning. Existing methods with the above desired properties are all based on closing triangles (common neighbors, Jaccard similarity, and the ilk). In this work, we investigate higher-order network motifs and develop techniques based on the notion of closing higher-order motifs that move beyond closing simple triangles. All methods described in this work are fast with a runtime that is sublinear in the number of nodes. The experimental results indicate the importance of closing higher-order motifs for ranking and link prediction applications. Finally, the proposed notion of higher-order motif closure can serve as a basis for studying and developing better ranking and link prediction methods.},
keywords={Motif closure, higher-order ranking, online link prediction, network motifs, graphlets, motif closure, link prediction, link estimation, online recommendation},
active={true},
}
@inproceedings{jin18-latent-network-summ,
author={Di Jin and Ryan A. Rossi and Danai Koutra and Eunyee Koh and Sungchul Kim and Anup Rao},
title={Bridging Network Embedding and Graph Summarization},
booktitle={arXiv:1811.04461},
year={2018},
pages={},
acceptance={},
pdf={pubs/Jin-et-al-latent-network-summarization.pdf},
preprint={https://arxiv.org/abs/1811.04461},
supp={},
pubtype={},
abstract={An important reason behind the prevalence of node representation learning is their superiority in downstream machine learning tasks on graphs. However, storing the vector-based node representation of massive real-world graphs often requires space that is orders of magnitude larger. To alleviate this issue, we introduce the problem of latent network summarization that is complementary to the problem of network embedding and propose a general framework called Multi-LENS. Instead of deriving dense node-wise representation, the goal of latent network summarization is to summarize the structural properties of the graph in a latent space with dimensionality that is independent of the nodes or edges in the input graph. The size-independent graph summary given by Multi-LENS consists of (i) a set of relational aggregators with their compositions (relational functions) that captures structural features of multi-order node-centric subgraphs, and (ii) the low-rank approximations to the induced matrices incorporating graph heterogeneity. In addition, Multi-LENS is able to derive node embeddings on the fly from this latent summarization due to its inductive properties. Multi-LENS bridges advantages brought by both embeddings and graph summarization, and applies to graphs with or without directionality, weights, attributes or labels. Extensive experiments on both synthetic and real-world graphs show that Multi-LENS achieves 2-89\% improvement in AUC for link prediction, while requiring less than 79x space compared to existing representation learning approaches. We also show the effectiveness of Multi-LENS summaries in detecting anomalies and events in the Enron email communication graph and Twitter co-mention graph.},
keywords={graph summarization, graph compression, heterogeneous networks, structural similarity, latent network summarization},
active={true},
}
@inproceedings{lee18-attention-survey,
title={Attention Models in Graphs: A Survey},
author={John Boaz Lee and Ryan A. Rossi and Sungchul Kim and Nesreen K. Ahmed and Eunyee Koh},
booktitle={Transactions on Knowledge Discovery from Data (TKDD)},
preprintNote={arXiv:1807.07984},
pages={19},
year={2019},
pdf={pubs/attention-models-in-graphs-preprint.pdf},
preprint={https://arxiv.org/pdf/1807.07984.pdf},
pubtype={journal},
abstract={Graph-structured data arise naturally in many different application domains. By representing data as graphs, we can capture entities (i.e., nodes) as well as their relationships (i.e., edges) with each other. Many useful insights can be derived from graph-structured data as demonstrated by an ever-growing body of work focused on graph mining. However, in the real-world, graphs can be both large - with many complex patterns - and noisy which can pose a problem for effective graph mining. An effective way to deal with this issue is to incorporate "attention" into graph mining solutions. An attention mechanism allows a method to focus on task-relevant parts of the graph, helping it to make better decisions. In this work, we conduct a comprehensive and focused survey of the literature on the emerging field of graph attention models. We introduce three intuitive taxonomies to group existing work. These are based on problem setting (type of input and output), the type of attention mechanism used, and the task (e.g., graph classification, link prediction, etc.). We motivate our taxonomies through detailed examples and use each to survey competing approaches from a unique standpoint. Finally, we highlight several challenges in the area and discuss promising directions for future work.},
keywords={Attentional processing, graph attention, attention model, deep graph models, reinforcement learning, recurrent neural networks, graph representation, representation learning, graph embeddings, graph feature learning, graph classification, bioinformatics, deep learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{kim-wsdm19,
title={Domain Switch-Aware Holistic Recurrent Neural Network for Modeling Multi-Domain User Behavior},
author={Donghyun Kim and Sungchul Kim and Handong Zhao and Sheng Li and Ryan A. Rossi and Eunyee Koh},
booktitle={WSDM},
pages={9},
year={2019},
pdf={pubs/kim-et-al-WSDM19.pdf},
pubtype={conference},
abstract={Understanding user behavior and predicting future behavior on the web is critical for providing seamless user experiences as well as increasing revenue of service providers. Recently, thanks to the remarkable success of recurrent neural networks (RNNs), it has been widely used for modeling sequences of user behaviors. However, although sequential behaviors appear across multiple domains in practice, existing RNN-based approaches still focus on the single-domain scenario assuming that sequential behaviors come from only a single domain. Hence, in order to analyze sequential behaviors across multiple domains, they require to separately train multiple RNN models, which fails to jointly model the interplay among sequential behaviors across multiple domains. Consequently, they often suffer from lack of information within each domain. In this paper, we first introduce a practical but overlooked phenomenon in sequential behaviors across multiple domains, i.e., domain switch where two successive behaviors belong to different domains. Then, we propose a Domain Switch-Aware Holistic Recurrent Neural Network (DS-HRNN) that effectively shares the knowledge extracted from multiple domains by systematically handling domain switch for the multi-domain scenario. DS-HRNN jointly models the multi-domain sequential behaviors and accurately predicts the future behaviors in each domain with only a single RNN model. Our extensive evaluations on two real-world datasets demonstrate that DS-HRNN outperforms existing RNN-based approaches and non-sequential baselines with significant improvements by up to 14.93\% in terms of recall of the future behavior prediction.},
keywords={Multi-domain user behavior, Sequence modeling, Domain switch, Recurrent neural network},
active={true},
}
@inproceedings{higher-order-clustering-heter,
title={Higher-order Spectral Clustering for Heterogeneous Graphs},
author={Aldo G. Carranza and Ryan A. Rossi and Anup Rao and Eunyee Koh},
booktitle={arXiv:1810.02959},
pages={15},
year={2018},
date={October 6, 2018},
pdf={pubs/Higher-order_Clustering_preprint.pdf},
pubtype={},
abstract={Higher-order connectivity patterns such as small induced sub-graphs called graphlets (network motifs) are vital to understand the important components (modules/functional units) governing the configuration and behavior of complex networks. Existing work in higher-order clustering has focused on simple homogeneous graphs with a single node/edge type. However, heterogeneous graphs consisting of nodes and edges of different types are seemingly ubiquitous in the real-world. In this work, we introduce the notion of typed-graphlet that explicitly captures the rich (typed) connectivity patterns in heterogeneous networks. Using typed-graphlets as a basis, we develop a general principled framework for higher-order clustering in heterogeneous networks. The framework provides mathematical guarantees on the optimality of the higher-order clustering obtained. The experiments demonstrate the effectiveness of the framework quantitatively for three important applications including (i) clustering, (ii) link prediction, and (iii) graph compression. In particular, the approach achieves a mean improvement of 43x over all methods and graphs for clustering while achieving a 18.7% and 20.8% improvement for link prediction and graph compression, respectively.},
keywords={Clustering, higher-order clustering, spectral clustering, typed graphlets, network motifs, community detection, heterogeneous graphs, labeled graphs, graph mining},
active={true},
}
@inproceedings{rossi-heterogeneous-motifs,
author={Ryan A. Rossi and Nesreen K. Ahmed and Aldo Carranza and David Arbour and Anup Rao and Sungchul Kim and Eunyee Koh},
title={Heterogeneous Network Motifs},
booktitle={arXiv:1901.10026},
year={2019},
pages={18},
pdf={pubs/heterogeneous-network-motifs.pdf},
preprint={https://arxiv.org/abs/1901.10026},
supp={https://arxiv.org/pdf/1901.10026.pdf},
pubtype={},
abstract={Many real-world applications give rise to large heterogeneous networks where nodes and edges can be of any arbitrary type (e.g., user, web page, location). Special cases of such heterogeneous graphs include homogeneous graphs, bipartite, k-partite, signed, labeled graphs, among many others. In this work, we generalize the notion of network motifs to heterogeneous networks. In particular, small induced typed subgraphs called typed graphlets (heterogeneous network motifs) are introduced and shown to be the fundamental building blocks of complex heterogeneous networks. Typed graphlets are a powerful generalization of the notion of graphlet (network motif) to heterogeneous networks as they capture both the induced subgraph of interest and the types associated with the nodes in the induced subgraph. To address this problem, we propose a fast, parallel, and space-efficient framework for counting typed graphlets in large networks. We discover the existence of non-trivial combinatorial relationships between lower-order ($k-1$)-node typed graphlets and leverage them for deriving many of the $k$-node typed graphlets in $o(1)$ constant time. Thus, we avoid explicit enumeration of those typed graphlets. Notably, the time complexity matches the best untyped graphlet counting algorithm. The experiments demonstrate the effectiveness of the proposed framework in terms of runtime, space-efficiency, parallel speedup, and scalability as it is able to handle large-scale networks.},
keywords={Heterogeneous network motifs, typed motifs, heterogeneous graphlets, typed graphlets, colored graphlets, colored motifs, colored network motifs, network motifs, heterogeneous networks, labeled graphs, signed networks, k-partite graphs},
active={true},
}
@inproceedings{chen19-neural-fig-caption-generation,
author={Charles Chen and Ruiyi Zhang and Sungchul Kim and Eunyee Koh and Scott Cohen and Tong Yu and Ryan A. Rossi and Razvan Bunescu},
title={Neural caption generation over figures},
booktitle={Proceedings of the 2019 ACM International Joint Conference on Pervasive and Ubiquitous Computing (UbiComp/ISWC)},
year={2019},
pages={482--485},
acceptance={},
preprint={},
pdf={},
supp={https://dl.acm.org/citation.cfm?id=3345601&dl=ACM&coll=DL},
doi={},
slides={},
location={},
date={},
pubtype={},
abstract={Figures are human-friendly but difficult for computers to process automatically. In this work, we investigate the problem of figure captioning. The goal is to automatically generate a natural language description of a given figure. We create a new dataset for figure captioning, FigCAP. To achieve accurate generation of labels in figures, we propose the Label Maps Attention Model. Extensive experiments show that our method outperforms the baselines. A successful solution to this task allows figure content to be accessible to those with visual impairment by providing input to a text-to-speech system; and enables automatic parsing of vast repositories of documents where figures are pervasive.},
keywords={Figure caption generation, sequence-level training, caption generation, computer vision, deep learning},
active={true},
}
@inproceedings{lee18-kdd-graph-attention,
title={Graph Classification using Structural Attention},
author={John Boaz Lee and Ryan A. Rossi and Xiangnan Kong},
booktitle={KDD},
pages={1--9},
year={2018},
pdf={pubs/KDD18-graph-attention-model.pdf},
poster={poster/KDD18.pdf},
pubtype={conference},
abstract={Graph classification is a problem with practical applications in many different domains. To solve this problem, one usually calculates certain graph statistics (i.e., graph features) that help discriminate between graphs of different classes. When calculating such features, most existing approaches process the entire graph. In a graphlet-based approach, for instance, the entire graph is processed to get the total count of different graphlets or subgraphs. In many real-world applications, however, graphs can be noisy with discriminative patterns confined to certain regions in the graph only. In this work, we study the problem of attention-based graph classification. The use of attention allows us to focus on small but informative parts of the graph, avoiding noise in the rest of the graph. We present a novel RNN model, called the Graph Attention Model (GAM), that processes only a portion of the graph by adaptively selecting a sequence of “informative” nodes. The model is equipped with an external memory component which allows it to integrate information gathered from different parts of the graph. Experimental results on multiple real-world datasets show that the proposed method is competitive against various well-known methods in the graph classification task even though these methods have access to the entire graph whereas we limit our method to only a portion of the graph.},
keywords={Attentional processing, graph attention, attention model, deep graph models, reinforcement learning, recurrent neural networks, graph representation, representation learning, graph embeddings, graph feature learning, graph classification, bioinformatics, deep learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{lee18-higher-order-GCNs,
title={Higher-order Graph Convolutional Networks},
author={John Boaz Lee and Ryan A. Rossi and Xiangnan Kong and Sungchul Kim and Eunyee Koh and Anup Rao},
booktitle={arXiv:1809.07697},
pages={1--8},
year={2018},
pdf={pubs/Higher-order-GCNs.pdf},
preprint={https://arxiv.org/abs/1809.07697},
pubtype={},
abstract={Following the success of deep convolutional networks in various vision and speech related tasks, researchers have started investigating generalizations of the well-known technique for graph-structured data. A recently-proposed method called Graph Convolutional Networks has been able to achieve state-of-the-art results in the task of node classification. However, since the proposed method relies on localized first-order approximations of spectral graph convolutions, it is unable to capture higher-order interactions between nodes in the graph. In this work, we propose a motif-based graph attention model, called Motif Convolutional Networks, which generalizes past approaches by using weighted multi-hop motif adjacency matrices to capture higher-order neighborhoods. A novel attention mechanism is used to allow each individual node to select the most relevant neighborhood to apply its filter. Experiments show that our proposed method is able to achieve state-of-the-art results on the semi-supervised node classification task.},
keywords={Attentional processing, graph attention, attention model, deep graph models, reinforcement learning, recurrent neural networks, graph representation, representation learning, graph embeddings, graph feature learning, graph classification, bioinformatics, deep learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{ctdne-bigdata18,
author={Giang Hoang Nguyen and John Boaz Lee and Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim},
title={Dynamic Network Embeddings: From Random Walks to Temporal Random Walks},
booktitle={IEEE BigData},
year={2018},
pages={1085--1092},
acceptance={18.9%},
pdf={pubs/CTDNE-BigData18.pdf},
preprint={http://doi.org/10.1109/BigData.2018.8622109},
supp={},
doi={10.1109/BigData.2018.8622109},
slides={talks/BigData18-CTDNE-talk.pdf},
pubtype={conference},
abstract={Networks evolve continuously over time with the addition, deletion, and changing of links and nodes. Although many networks contain this type of temporal information, the majority of research in network representation learning has focused on static snapshots of the graph and has largely ignored the temporal dynamics of the network. In this work, we describe a general framework for incorporating temporal information into network embedding methods. The framework gives rise to methods for learning time-respecting embeddings from continuous-time dynamic networks. Overall, the experiments demonstrate the effectiveness of the proposed framework and dynamic network embedding approach as it achieves an average gain of 11.9% across all methods and graphs. The results indicate that modeling temporal dependencies in graphs is important for learning appropriate and meaningful network representations.},
keywords={temporal random walks, dynamic network embeddings,
temporal network embeddings,
continuous-time dynamic network embeddings,
network representation learning,
graph representation learning,
node embeddings,
dynamic networks,
temporal networks,
temporal graphs,
temporal walks,
dynamic walks,
graph streams,
continuous-time dynamic networks},
active={true},
}
@inproceedings{rossi18icdm,
  title      = {Interactive Higher-order Network Analysis},
  author     = {Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh},
  booktitle  = {IEEE International Conference on Data Mining (ICDM)},
  pages      = {6},
  year       = {2018},
  submitted  = {2017},
  pdf        = {pubs/ICDM18-Interactive-Higher-order-Network-Analysis.pdf},
  slides     = {https://youtu.be/VE-GsP4p9n8},
  slideshare = {},
  poster     = {poster/ICDM18-Poster.pdf},
  pubtype    = {conference},
  abstract   = {Higher-order network modeling and analysis are vital to understanding the structures governing the configuration and behavior of complex networks. While network motifs are known to be fundamental building blocks of complex networks, the higher-order configuration and organization of complex networks remain widely unknown. In this work, we develop interactive visual higher-order network mining and modeling techniques to gain insight into the higher-order structure and composition of complex networks. The approach uncovers higher-order configurations including important phenotypes in a human gene interaction network and hubs in a power grid network.},
  keywords   = {Graphlets, network motifs, induced subgraphs, higher-order network analysis, higher-order graph mining, higher-order link prediction, interactive visual analytics, interactive visual graph mining, machine learning},
  active     = {true},
}
@inproceedings{rossi18-bigdata,
title={Relational Similarity Machines (RSM): A Similarity-based Learning Framework for Graphs},
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed and Hoda Eldardiry},
booktitle={IEEE BigData},
pages={10},
year={2018},
pdf={pubs/rossi-et-al-rsm-bigdata18.pdf},
pubtype={conference},
slides={talks/BigData18-RSM-talk.pdf},
abstract={Relational machine learning has become increasingly important due to the recent proliferation and ubiquity of network data. However, existing methods are not designed for interactive learning and have many unrealistic assumptions that greatly limit their utility in practice. For instance, most existing work has focused on graphs with high relational autocorrelation (homophily) and perform poorly otherwise. To overcome these limitations, this paper presents a similarity-based relational learning framework called Relational Similarity Machines (RSM) for networks with arbitrary relational autocorrelation. The RSM framework is designed to be fast, accurate, and flexible for learning on a wide variety of networks. The experiments demonstrate the effectiveness of the RSM framework.},
keywords={heterophily,
semi-supervised learning, collective classification, similarity-based graph learning,
statistical relational learning, interactive relational learning, kernel functions},
active={true},
}
@inproceedings{chen-cikm18,
title={Predictive Analysis by Leveraging Temporal User Behavior and User Embeddings},
author={Charles Chen and Sungchul Kim and Hung Bui and Ryan A. Rossi and Branislav Kveton and Eunyee Koh and Razvan Bunescu},
booktitle={CIKM},
pages={1--10},
year={2018},
pdf={pubs/CIKM18-pred-analysis-user-embeddings.pdf},
pubtype={conference},
abstract={The rapid growth of mobile devices has resulted in the generation of a large number of user behavior logs that contain latent intentions and user interests. However, exploiting such data in real-world applications is still difficult for service providers due to the complexities of user behavior over a sheer number of possible actions that can vary according to time. In this work, a time-aware RNN model, TRNN, is proposed for predictive analysis from user behavior data. First, our approach predicts next user action more accurately than the baselines including the n-gram models as well as two recently introduced time-aware RNN approaches. Second, we use TRNN to learn user embeddings from sequences of user actions and show that overall the TRNN embeddings outperform conventional RNN embeddings. Similar to how word embeddings benefit a wide range of task in natural language processing, the learned user embeddings are general and could be used in a variety of tasks in the digital marketing area. This claim is supported empirically by evaluating their utility in user conversion prediction, and preferred application prediction. According to the evaluation results, TRNN embeddings perform better than the baselines including Bag of Words (BoW), TF$\cdot$IDF and Doc2Vec. We believe that TRNN embeddings provide an effective representation for solving practical tasks such as recommendation, user segmentation and predictive analysis of business metrics.},
keywords={deep graph models, recurrent neural networks, user behavior modeling, user embeddings, action embeddings, time-aware embeddings, TRNN, LSTM, representation learning, deep learning, temporal information, predictive analysis, next user action prediction, machine learning},
active={true},
}
@inproceedings{rossi-WWW18,
  author    = {Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh},
  title     = {Higher-Order Network Representation Learning},
  booktitle = {Proceedings of the 27th International Conference Companion on World Wide Web (WWW)},
  year      = {2018},
  date      = {January 28, 2018},
  pages     = {},
  pdf       = {pubs/rossi-et-al-WWW18.pdf},
  preprint  = {},
  supp      = {},
  poster    = {poster/WWW18-HONE.pdf},
  pubtype   = {conference},
  abstract  = {This paper describes a general framework for learning Higher-Order Network Embeddings (HONE) from graph data based on network motifs. The HONE framework is highly expressive and flexible with many interchangeable components. The experimental results demonstrate the effectiveness of learning higher-order network representations. In all cases, HONE outperforms recent embedding methods that are unable to capture higher-order structures with a mean relative gain in AUC of 19% (and up to 75% gain) across a wide variety of networks and embedding methods.},
  keywords  = {higher-order network embeddings, graphlets, network motifs, induced subgraph patterns, orbits, higher-order network analysis, network representation learning, graph feature learning, graph representation learning, node embeddings, higher-order graph features, inductive graph features, attributed graphs, feature diffusion, higher-order graph embeddings, higher-order organization, higher-order network modules, higher-order connectivity patterns, HONE},
  active    = {true},
}
@inproceedings{rossi-WWW18-BigNet,
  author    = {Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
  title     = {Deep Inductive Network Representation Learning},
  booktitle = {Proceedings of the 3rd International Workshop on Learning Representations for Big Networks (WWW BigNet)},
  year      = {2018},
  pages     = {8},
  pdf       = {pubs/rossi-et-al-WWW18-BigNet.pdf},
  preprint  = {},
  supp      = {},
  pubtype   = {workshop},
  abstract  = {This paper presents a general graph representation learning framework called DeepGL for learning deep node and edge representations from large (attributed) graphs. In particular, DeepGL begins by deriving a set of base features (e.g., graphlet features) and automatically learns a multi-layered hierarchical graph representation where each successive layer leverages the output from the previous layer to learn features of a higher-order. Contrary to previous work, DeepGL learns relational functions (each representing a feature) that generalize across-networks and therefore useful for graph-based transfer learning tasks. Moreover, DeepGL naturally supports attributed graphs, learns interpretable graph representations, and is space-efficient (by learning sparse feature vectors). In addition, DeepGL is expressive, flexible with many interchangeable components, efficient with a time complexity of $\mathcal{O}(|E|)$, and scalable for large networks via an efficient parallel implementation. Compared with the state-of-the-art method, DeepGL is (1) effective for across-network transfer learning tasks and attributed graph representation learning, (2) space-efficient requiring up to 6x less memory, (3) fast with up to 182x speedup in runtime performance, and (4) accurate with an average improvement of 20% or more on many learning tasks.},
  keywords  = {Inductive network representation learning, inductive learning, network representation learning, Graph feature learning, graph representation learning, deep graph features, relational functions, relational feature operators, higher-order features, transfer learning, inductive graph features, attributed graphs, node/edge features, node feature learning, edge feature learning, relational deep learning, deep graph learning, hierarchical graph representation, feature diffusion, graphlets, network motifs, induced subgraph patterns, deep learning},
  active    = {true},
}
@inproceedings{role2vec,
title={role2vec: Role-based Network Embeddings},
author={Nesreen K. Ahmed and Ryan A. Rossi and John Boaz Lee and Theodore L. Willke and Rong Zhou and Xiangnan Kong and Hoda Eldardiry},
booktitle={DLG KDD},
fullbooktitle={Deep Learning on Graphs: Methods and Applications (DLG'19)},
year={2019},
pdf={pubs/role2vec-DLG-KDD.pdf},
pubtype={workshop},
abstract={Random walks are at the heart of many existing network embedding methods. However, such methods have many limitations that arise from the use of traditional random walks, e.g., the embeddings resulting from these methods primarily capture proximity (communities) among the vertices as opposed to structural similarity (roles). In this work, we introduce the Role2Vec framework which uses the flexible notion of attributed random walks, and serves as a basis for generalizing existing methods such as DeepWalk, node2vec, and many others that leverage random walks. Our proposed framework enables these methods to be more widely applicable by learning functions that capture the behavioral roles of the nodes. We show that our proposed framework is effective with an average AUC improvement of 16.55% while requiring on average 853x less space than existing methods.},
keywords={feature-based walk, attributed walk, roles, positions, network representation learning, node embeddings, graph embeddings, role-based embeddings, role discovery, inductive representation learning, attributed networks, graphlets, attributed random walks, labeled random walks},
active={true},
}
@article{rossi18tnnls,
title={Estimation of Graphlet Counts in Massive Networks},
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
journal={IEEE Transactions on Neural Networks and Learning Systems (TNNLS)},
pages={44--57},
year={2018},
submitted={2016},
pdf={pubs/TNNLS18-Global-and-Local-Graphlet-Estimation.pdf},
preprint={http://arxiv.org/abs/1701.01772},
doi={10.1109/TNNLS.2018.2826529},
pubtype={journal},
abstract={Graphlets are induced subgraphs of a large network and are important for understanding and modeling complex networks. Despite their practical importance, graphlets have been severely limited to applications and domains with relatively small graphs.
Most previous work has focused on exact algorithms, however, it is often too expensive to compute graphlets exactly in massive networks with billions of edges, and finding an approximate count is usually sufficient for many applications. In this work, we propose an unbiased graphlet estimation framework that is
(a) fast with large speedups compared to the state-of-the-art,
(b) parallel with nearly linear-speedups,
(c) accurate with less than $1\%$ relative error,
(d) scalable and space-efficient for massive networks with billions of edges, and
(e) effective for a variety of real-world settings, as well as estimating global and local graphlet statistics (e.g., counts).
On 300 networks from 20 domains, we obtain <$1\%$ relative error for all graphlets.
This is vastly more accurate than existing methods while using less data.
Moreover, it takes a few seconds on billion edge graphs (as opposed to days/weeks).
These are by far the largest graphlet computations to date.},
keywords={Graphlets,
network motifs,
induced subgraphs,
estimation methods,
unbiased graphlet estimation,
local graphlet count estimation,
graphlet statistics,
parallel algorithms,
higher-order network analysis,
machine learning},
active={true},
}
@inproceedings{nguyen-WWW18,
author={Giang Hoang Nguyen and John Boaz Lee and Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim},
title={Continuous-Time Dynamic Network Embeddings},
booktitle={Proceedings of the 3rd International Workshop on Learning Representations for Big Networks (WWW BigNet)},
year={2018},
pages={},
pdf={pubs/nguyen-et-al-WWW18-BigNet.pdf},
preprint={},
supp={},
pubtype={workshop},
abstract={Networks evolve continuously over time with the addition, deletion, and changing of links and nodes. Although many networks contain this type of temporal information, the majority of research in network representation learning has focused on static snapshots of the graph and has largely ignored the temporal dynamics of the network. In this work, we describe a general framework for incorporating temporal information into network embedding methods. The framework gives rise to methods for learning time-respecting embeddings from continuous-time dynamic networks. Overall, the experiments demonstrate the effectiveness of the proposed framework and dynamic network embedding approach as it achieves an average gain of 11.9% across all methods and graphs. The results indicate that modeling temporal dependencies in graphs is important for learning appropriate and meaningful network representations.},
keywords={dynamic network embeddings,
temporal network embeddings,
continuous-time dynamic network embeddings,
inductive network representation learning,
network representation learning,
graph representation learning,
node embeddings,
dynamic networks,
temporal networks,
graph streams,
continuous-time dynamic networks},
active={true},
}
@inproceedings{role2vec-ijcai18,
  title     = {Learning Role-based Graph Embeddings},
  author    = {Nesreen K. Ahmed and Ryan A. Rossi and Rong Zhou and John Boaz Lee and Xiangnan Kong and Theodore L. Willke and Hoda Eldardiry},
  booktitle = {StarAI IJCAI},
  year      = {2018},
  pdf       = {https://arxiv.org/pdf/1802.02896.pdf},
  preprint  = {https://arxiv.org/pdf/1802.02896.pdf},
  supp      = {https://arxiv.org/pdf/1802.02896.pdf},
  pubtype   = {workshop},
  abstract  = {Random walks are at the heart of many existing network embedding methods. However, such algorithms have many limitations that arise from the use of random walks, e.g., the features resulting from these methods are unable to transfer to new nodes and graphs as they are tied to vertex identity. In this work, we introduce the Role2Vec framework which uses the flexible notion of attributed random walks, and serves as a basis for generalizing existing methods such as DeepWalk, node2vec, and many others that leverage random walks. Our proposed framework enables these methods to be more widely applicable for both transductive and inductive learning as well as for use on graphs with attributes (if available). This is achieved by learning functions that generalize to new nodes and graphs. We show that our proposed framework is effective with an average AUC improvement of 16.55% while requiring on average 853x less space than existing methods on a variety of graphs.},
  keywords  = {network representation learning, node embeddings, graph embeddings, role-based embeddings, role discovery, inductive representation learning, attributed networks, graphlets, attributed random walks, labeled random walks},
  active    = {true},
}
@inproceedings{rossi-HONE-arxiv,
  author    = {Ryan A. Rossi and Nesreen K. Ahmed and Eunyee Koh and Sungchul Kim and Anup Rao and Yasin Abbasi-Yadkori},
  title     = {HONE: Higher-Order Network Embeddings},
  booktitle = {arXiv:1801.09303},
  year      = {2018},
  pages     = {},
  pdf       = {https://arxiv.org/pdf/1801.09303.pdf},
  preprint  = {https://arxiv.org/pdf/1801.09303.pdf},
  supp      = {https://arxiv.org/pdf/1801.09303.pdf},
  pubtype   = {},
  abstract  = {This paper describes a general framework for learning Higher-Order Network Embeddings (HONE) from graph data based on network motifs. The HONE framework is highly expressive and flexible with many interchangeable components. The experimental results demonstrate the effectiveness of learning higher-order network representations. In all cases, HONE outperforms recent embedding methods that are unable to capture higher-order structures with a mean relative gain in AUC of 19% (and up to 75% gain) across a wide variety of networks and embedding methods.},
  keywords  = {higher-order network embeddings, graphlets, network motifs, induced subgraph patterns, orbits, higher-order network analysis, network representation learning, graph feature learning, graph representation learning, node embeddings, higher-order graph features, inductive graph features, attributed graphs, feature diffusion, higher-order graph embeddings, higher-order organization, higher-order network modules, higher-order connectivity patterns, HONE, deep learning},
  active    = {true},
}
@inproceedings{canning18-pred,
  title     = {Predicting Graph Categories from Structural Properties},
  author    = {James P. Canning and Emma E. Ingram and Sammantha Nowak-Wolff and Adriana M. Ortiz and Nesreen K. Ahmed and Ryan A. Rossi and Karl R. B. Schmitt and Sucheta Soundarajan},
  booktitle = {arXiv:1805.02682},
  pages     = {1--10},
  year      = {2018},
  pdf       = {pubs/predicting-graph-categories-preprint.pdf},
  pubtype   = {},
  abstract  = {Complex networks are often categorized according to the underlying phenomena that they represent such as molecular interactions, re-tweets, and brain activity. In this work, we investigate the problem of predicting the category (domain) of arbitrary networks. This includes complex networks from different domains as well as synthetically generated graphs from five different network models. A classification accuracy of 96.6% is achieved using a random forest classifier with both real and synthetic networks. This work makes two important findings. First, our results indicate that complex networks from various domains have distinct structural properties that allow us to predict with high accuracy the category of a new previously unseen network. Second, synthetic graphs are trivial to classify as the classification model can predict with near-certainty the network model used to generate it. Overall, the results demonstrate that networks drawn from different domains (and network models) are trivial to distinguish using only a handful of simple structural properties.},
  keywords  = {network classification, network categorization, graph classification, graph features, massive graphs, big data, massive networks, machine learning, structural properties, across-domain graph classification, network science, complex networks, graph similarity, graph matching},
  active    = {false},
}
@article{rossi2018compressing-graphs-cliques,
  title     = {GraphZIP: A Clique-based Sparse Graph Compression Method},
  author    = {Ryan A. Rossi and Rong Zhou},
  journal   = {Journal of Big Data},
  year      = {2018},
  volume    = {5},
  number    = {1},
  pages     = {14},
  preprint  = {},
  doi       = {10.1186/s40537-018-0121-z},
  pdf       = {pubs/rossi-et-al-2018-Journal-of-Big-Data.pdf},
  url       = {https://doi.org/10.1186/s40537-018-0121-z},
  abstract  = {Massive graphs are ubiquitous and at the heart of many real-world applications ranging from the World Wide Web to social networks. As a result, techniques for compressing graphs have become increasingly important. In this work, we propose a graph compression and encoding framework called GraphZIP based on the observation that real-world graphs often form many cliques of a large size. Using this as a foundation, we decompose the graph into a set of large cliques, which is then used to encode the graph succinctly. In particular, disk-resident and in-memory graph encodings are proposed and shown to be effective with three important benefits. First, it reduces the space needed to store the graph on disk and in-memory. Second, GraphZIP reduces IO traffic involved in using the graph. Third, it can often reduce the work involved in running an algorithm on the graph. The experiments demonstrate the scalability, flexibility, and effectiveness of the clique-based compression techniques.},
  pubtype   = {journal},
  active    = {true},
}
@article{rossi2018ker,
title={Relational Time Series Forecasting},
author={Ryan A. Rossi},
journal={Knowledge Engineering Review (KER)},
year={2018},
volume={33},
pages={e1},
preprint={},
publisher={Cambridge University Press},
doi={10.1017/S0269888918000024},
pdf={pubs/rossi-KER18-RTSF.pdf},
submissionDate={October 1, 2015},
abstract={Networks encode dependencies between entities (people, computers, proteins) and allow us to study phenomena across social, technological, and biological domains. These networks naturally evolve over time by the addition, deletion, and changing of links, nodes, and attributes. Despite the importance of modeling these dynamics, existing work in relational machine learning (RML) has ignored relational time series data. Relational time series learning lies at the intersection of traditional time series analysis and Statistical Relational Learning (SRL), and bridges the gap between these two fundamentally important problems. This paper formulates the relational time series learning problem, and a general framework and taxonomy for representation discovery tasks of both nodes and links including predicting their existence, label, and weight (importance), as well as systematically constructing features. We also reinterpret the prediction task leading to the proposal of two important relational time series forecasting tasks consisting of (i) relational time series classification (predicts a future class or label of an entity), and (ii) relational time series regression (predicts a future real-valued attribute or weight). Relational time series models are designed to leverage both relational and temporal dependencies to minimize forecasting error for both relational time series classification and regression. Finally, we discuss challenges and open problems that remain to be addressed.},
keywords={Relational Time Series Modeling, Relational Learning, Statistical Relational Learning, Time Series Forecasting, Temporal Networks, Classification, Regression, Graph-based Learning, Graph Models, Dynamic Network Embeddings},
pubtype={journal},
active={true},
}
@article{rossi2017graphvis,
  title     = {Interactive Visual Graph Mining and Learning},
  author    = {Ryan A. Rossi and Nesreen K. Ahmed and Hoda Eldardiry and Rong Zhou},
  journal   = {ACM Transactions on Intelligent Systems and Technology},
  year      = {2018},
  pages     = {1--30},
  preprint  = {pubs/rossi-et-al-TIST18.pdf},
  supp      = {pubs/rossi-et-al-TIST18-supp.pdf},
  pdf       = {pubs/rossi-et-al-TIST18.pdf},
  abstract  = {This paper presents a platform for interactive graph mining and relational machine learning called GraphVis. The platform combines interactive visual representations with state-of-the-art graph mining and relational machine learning techniques to aid in revealing important insights quickly as well as learning an appropriate and highly predictive model for a particular task (e.g., classification, link prediction, discovering the roles of nodes, finding influential nodes). Visual representations and interaction techniques and tools are developed for simple, fast, and intuitive real-time interactive exploration, mining, and modeling of graph data. In particular, we propose techniques for interactive relational learning (e.g., node/link classification), interactive link prediction and weighting, role discovery and community detection, higher-order network analysis (via graphlets, network motifs), among others. GraphVis also allows for the refinement and tuning of graph mining and relational learning methods for specific application domains and constraints via an end-to-end interactive visual analytic pipeline that learns, infers, and provides rapid interactive visualization with immediate feedback at each change/prediction in real-time. Other key aspects include interactive filtering, querying, ranking, manipulating, exporting, as well as tools for dynamic network analysis and visualization, interactive graph generators (including new block model approaches), and a variety of multi-level network analysis techniques.},
  keywords  = {interactive relational machine learning, interactive visual graph mining, interactive network analysis, interactive network visualization, interactive graph learning, higher-order network analysis, interactive role discovery, link prediction, node embeddings, interactive graph generation, rapid visual feedback, direct manipulation, real-time performance},
  pubtype   = {journal},
  active    = {true},
}
@inproceedings{ahmed17learning-attr-graphs,
  title     = {Inductive Representation Learning in Large Attributed Graphs},
  author    = {Nesreen K. Ahmed and Ryan A. Rossi and Rong Zhou and John Boaz Lee and Xiangnan Kong and Theodore L. Willke and Hoda Eldardiry},
  booktitle = {WiML NIPS},
  pages     = {},
  year      = {2017},
  pdf       = {pubs/inductive-network-representation-learning.pdf},
  preprint  = {https://arxiv.org/abs/1710.09471},
  supp      = {https://arxiv.org/abs/1710.09471},
  arxivPDF  = {https://arxiv.org/pdf/1710.09471.pdf},
  pubtype   = {workshop},
  abstract  = {Learning a useful feature representation from graph data lies at the heart and success of many machine learning tasks such as classification, anomaly detection, link prediction, among many others. Many existing techniques use random walks as a basis for learning features or estimating the parameters of a graph model for a downstream prediction task. Examples include recent node embedding methods such as DeepWalk, node2vec, as well as graph-based deep learning algorithms. However, the simple random walk used by these methods is fundamentally tied to the identity of the node. This has three main disadvantages. First, these approaches are inherently transductive and do not generalize to unseen nodes and other graphs. Second, they are not space-efficient as a feature vector is learned for each node which is impractical for large graphs. Third, most of these approaches lack support for attributed graphs. To make these methods more generally applicable, we propose a framework for inductive network representation learning based on the notion of attributed random walk that is not tied to node identity and is instead based on learning a function $\phi : \mathbf{x} \rightarrow w$ that maps a node attribute vector to a type. This framework serves as a basis for generalizing existing methods such as DeepWalk, node2vec, and many other previous methods that leverage traditional random walks.},
  keywords  = {network representation learning, roles, structural similarity, graph embeddings, feature learning, relational functions, random walks, deep learning, graph-based deep learning, machine learning, network embedding, inductive representation learning},
  location  = {Long Beach, CA USA},
  active    = {true},
}
@inproceedings{lee17-Deep-Graph-Attention,
title={Deep Graph Attention Model},
author={John Boaz Lee and Ryan A. Rossi and Xiangnan Kong},
booktitle={arXiv:1709.06075},
pages={1--8},
year={2017},
pdf={pubs/deep-graph-attention.pdf},
preprint={https://arxiv.org/abs/1709.06075},
supp={https://arxiv.org/abs/1709.06075},
arxivPDF={https://arxiv.org/pdf/1709.06075.pdf},
pubtype={},
abstract={Graph classification is a problem with practical applications in many different domains. Most of the existing methods take the entire graph into account when calculating graph features. In a graphlet-based approach, for instance, the entire graph is processed to get the total count of different graphlets or sub-graphs. In the real-world, however, graphs can be both large and noisy with discriminative patterns confined to certain regions in the graph only. In this work, we study the problem of attentional processing for graph classification. The use of attention allows us to focus on small but informative parts of the graph, avoiding noise in the rest of the graph. We present a novel RNN model, called the Graph Attention Model (GAM), that processes only a portion of the graph by adaptively selecting a sequence of "interesting" nodes. The model is equipped with an external memory component which allows it to integrate information gathered from different parts of the graph. We demonstrate the effectiveness of the model through various experiments.},
keywords={graph attention, attention model, deep graph models, reinforcement learning, recurrent neural networks, graph representation, representation learning, graph embeddings, graph feature learning, graph classification, bioinformatics, deep learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{ahmed17Gen-Deep-Graph-Learning,
title={A Framework for Generalizing Graph-based Representation Learning Methods},
author={Nesreen K. Ahmed and Ryan A. Rossi and Rong Zhou and John Boaz Lee and Xiangnan Kong and Theodore L. Willke and Hoda Eldardiry},
booktitle={arXiv:1709.04596},
pages={1--8},
year={2017},
pdf={pubs/ahmed-et-al-representation-learning-in-graphs.pdf},
preprint={https://arxiv.org/abs/1709.04596},
supp={pubs/ahmed-et-al-representation-learning-in-graphs.pdf},
pubtype={},
abstract={Random walks are at the heart of many existing deep learning algorithms for graph data. However, such algorithms have many limitations that arise from the use of random walks, e.g., the features resulting from these methods are unable to transfer to new nodes and graphs as they are tied to node identity. In this work, we introduce the notion of attributed random walks which serves as a basis for generalizing existing methods such as DeepWalk, node2vec, and many others that leverage random walks. Our proposed framework makes these methods more widely applicable for both transductive and inductive learning (by learning functions that generalize to new nodes and graphs) as well as for use on graphs with attributes (if available). Finally, the approach is shown to be effective with an average AUC improvement of 16.1\% while requiring on average 853 times less space than existing methods on a variety of graphs from several domains.},
keywords={graph representation, representation learning, graph embeddings, feature learning, relational functions, random walks, network embedding, inductive representation learning, inductive learning, attributed networks, attributed network representation learning, deep learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{ahmed17attrRandomWalks,
title={Generalizing Deep Learning in Graphs using Attributed Random Walks},
author={Nesreen K. Ahmed and Ryan A. Rossi and Rong Zhou and John Boaz Lee and Xiangnan Kong and Theodore L. Willke and Hoda Eldardiry},
booktitle={arXiv:1709.04596},
pages={1--8},
year={2017},
pdf={pubs/ahmed-et-al-generalizing-deep-graph-learning.pdf},
preprint={},
supp={pubs/ahmed-et-al-generalizing-deep-graph-learning.pdf},
pubtype={},
abstract={Random walks are at the heart of many existing deep learning algorithms for graph data. However, such algorithms have many limitations that arise from the use of random walks, e.g., the features resulting from these methods are unable to transfer to new nodes and graphs as they are tied to node identity. In this work, we introduce the notion of attributed random walks which serves as a basis for generalizing existing methods such as DeepWalk, node2vec, and many others that leverage random walks. Our proposed framework makes these methods more widely applicable for both transductive and inductive learning (by learning functions that generalize to new nodes and graphs) as well as for use on graphs with attributes (if available). Finally, the approach is shown to be effective with an average AUC improvement of 16.1\% while requiring on average 853 times less space than existing methods on a variety of graphs from several domains.},
keywords={graph representation, representation learning, graph embeddings, feature learning, relational functions, random walks, deep learning, network embedding, inductive representation learning, inductive learning, attributed networks, attributed network representation learning, graph-based deep learning, machine learning},
active={true},
}
@inproceedings{network-classification,
title={Network Classification and Categorization},
author={James P. Canning and Emma E. Ingram and Sammantha Nowak-Wolff and Adriana M. Ortiz and Nesreen K. Ahmed and Ryan A. Rossi and Karl R. B. Schmitt and Sucheta Soundarajan},
booktitle={International Conference on Complex Networks (CompleNet)},
pages={},
year={2018},
pdf={pubs/network-classification.pdf},
preprint={https://arxiv.org/abs/1709.04481},
supp={pubs/network-classification.pdf},
arxiv={https://arxiv.org/pdf/1709.04481.pdf},
pubtype={},
abstract={To the best of our knowledge, this paper presents the first large-scale study that tests whether network categories (e.g., social networks vs. web graphs) are distinguishable from one another (using both categories of real-world networks and synthetic graphs). A classification accuracy of 94.2\% was achieved using a random forest classifier with both real and synthetic networks. This work makes two important findings. First, real-world networks from various domains have distinct structural properties that allow us to predict with high accuracy the category of an arbitrary network. Second, classifying synthetic networks is trivial as our models can easily distinguish between synthetic graphs and the real-world networks they are supposed to model.},
keywords={network classification, network categorization, graph classification, graph features, massive graphs, big data, massive networks, machine learning},
active={true},
}
@inproceedings{network-classification-arxiv,
title={Network Classification and Categorization},
author={James P. Canning and Emma E. Ingram and Sammantha Nowak-Wolff and Adriana M. Ortiz and Nesreen K. Ahmed and Ryan A. Rossi and Karl R. B. Schmitt and Sucheta Soundarajan},
booktitle={arXiv preprint},
pages={},
month={September},
year={2017},
pdf={pubs/network-classification.pdf},
preprint={https://arxiv.org/abs/1709.04481},
supp={pubs/network-classification.pdf},
arxiv={https://arxiv.org/pdf/1709.04481.pdf},
pubtype={},
abstract={To the best of our knowledge, this paper presents the first large-scale study that tests whether network categories (e.g., social networks vs. web graphs) are distinguishable from one another (using both categories of real-world networks and synthetic graphs). A classification accuracy of 94.2\% was achieved using a random forest classifier with both real and synthetic networks. This work makes two important findings. First, real-world networks from various domains have distinct structural properties that allow us to predict with high accuracy the category of an arbitrary network. Second, classifying synthetic networks is trivial as our models can easily distinguish between synthetic graphs and the real-world networks they are supposed to model.},
keywords={network classification, network categorization, graph classification, graph features, massive graphs, big data, massive networks, machine learning},
active={false},
}
@inproceedings{ahmed17streams,
title={On Sampling from Massive Graph Streams},
author={Nesreen K. Ahmed and Nick Duffield and Theodore L. Willke and Ryan A. Rossi},
booktitle={VLDB},
pages={1430--1441},
year={2017},
location={Munich, Germany},
external={http://www.vldb.org/pvldb/vol10/p1430-ahmed.pdf},
pdf={pubs/ahmed-et-al-2017-arxiv-on-sampling-massive-graph-streams.pdf},
preprint={https://arxiv.org/abs/1703.02625},
supp={pubs/ahmed-et-al-2017-arxiv-on-sampling-massive-graph-streams.pdf},
pubtype={conference},
abstract={We propose Graph Priority Sampling (GPS), a new paradigm for order-based reservoir sampling from massive streams of graph edges. GPS provides a general way to weight edge sampling according to auxiliary and/or size variables so as to accomplish various estimation goals of graph properties. In the context of subgraph counting, we show how edge sampling weights can be chosen so as to minimize the estimation variance of counts of specified sets of subgraphs. In distinction with many prior graph sampling schemes, GPS separates the functions of edge sampling and subgraph estimation. We propose two estimation frameworks: (1) Post-Stream estimation, to allow GPS to construct a reference sample of edges to support retrospective graph queries, and (2) In-Stream estimation, to allow GPS to obtain lower variance estimates by incrementally updating the subgraph count estimates during stream processing. Unbiasedness of subgraph estimators is established through a new Martingale formulation of graph stream order sampling, which shows that subgraph estimators, written as a product of constituent edge estimators are unbiased, even when computed at different points in the stream. The separation of estimation and sampling enables significant resource savings relative to previous work. We illustrate our framework with applications to triangle and wedge counting. We perform a large-scale experimental study on real-world graphs from various domains and types. GPS achieves high accuracy with less than 1\% error for triangle and wedge counting, while storing a small fraction of the graph with average update times of a few microseconds per edge. Notably, for a large Twitter graph with more than 260M edges, GPS accurately estimates triangle counts with less than 1\% error, while storing only 40K edges.},
keywords={graph streams, graph priority sampling, GPS, reservoir sampling, network sampling, edge sampling, sampling, graph sample and hold, online estimation, online learning, graphlets, motifs, statistical estimation, unbiased estimators, subgraph counts, network motifs, motif statistics, massive graphs, big data, massive networks, machine learning},
active={true},
}
@inproceedings{rossi-deepGL-arxiv,
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
title={Deep Feature Learning for Graphs},
booktitle={arXiv:1704.08829},
year={2017},
pages={1--11},
pdf={pubs/rossi-et-al-2017-arxiv-deepGL-deep-graph-learning.pdf},
preprint={https://arxiv.org/abs/1704.08829},
supp={pubs/rossi-et-al-2017-arxiv-deepGL-deep-graph-learning.pdf},
pubtype={},
abstract={This paper presents a general graph representation learning framework called DeepGL for learning deep node and edge representations from large (attributed) graphs.
In particular, DeepGL begins by deriving a set of base features (e.g., graphlet features) and automatically learns a multi-layered hierarchical graph representation where each successive layer leverages the output from the previous layer to learn features of a higher-order. Contrary to previous work, DeepGL learns relational functions (each representing a feature) that generalize across-networks and therefore useful for graph-based transfer learning tasks.
Moreover, DeepGL naturally supports attributed graphs, learns interpretable graph representations, and is space-efficient (by learning sparse feature vectors).
In addition, DeepGL is expressive, flexible with many interchangeable components, efficient with a time complexity of $\mathcal{O}(|E|)$, and scalable for large networks via an efficient parallel implementation.
Compared with the state-of-the-art method, DeepGL is
(1) effective for across-network transfer learning tasks and attributed graph representation learning,
(2) space-efficient requiring up to 6x less memory,
(3) fast with up to 182x speedup in runtime performance, and
(4) accurate with an average improvement of 20% or more on many learning tasks.},
keywords={Inductive network representation learning,
inductive learning,
network representation learning,
graph feature learning,
graph representation learning,
deep graph features,
relational functions,
relational feature operators,
higher-order features,
transfer learning,
inductive graph features,
attributed graphs,
node/edge features,
node feature learning,
edge feature learning,
relational deep learning,
deep graph learning,
hierarchical graph representation,
feature diffusion,
graphlets,
network motifs,
induced subgraph patterns,
deep learning},
active={true},
}
@inproceedings{rossi18-sml,
title={Similarity-based Multi-label Learning},
author={Ryan A. Rossi and Nesreen K. Ahmed and Hoda Eldardiry and Rong Zhou},
booktitle={International Joint Conference on Neural Networks (IJCNN)},
pages={1--8},
year={2018},
pdf={pubs/rossi-IJCNN18-SML.pdf},
preprint={https://arxiv.org/abs/1710.10335},
supp={https://arxiv.org/pdf/1710.10335.pdf},
arxivPDF={https://arxiv.org/abs/1710.10335},
pubtype={conference},
abstract={Multi-label classification is an important learning problem with many applications. In this work, we propose a similarity-based approach for multi-label learning called SML. We also introduce a similarity-based approach for predicting the label set size. SML is amenable to streaming data and online learning, naturally able to handle changes in the problem domain, robust to training data with skewed class label sets, accurate with low variance, and lends itself to an efficient parallel implementation. The experimental results demonstrate the effectiveness of SML for multi-label classification where it is shown to compare favorably with a wide variety of existing algorithms across a range of evaluation criterion.},
keywords={multi-label classification, multi-label learning, kernel learning, similarity-based multi-label learning, similarity function learning, image classification, gene function classification},
active={true},
}
@inproceedings{rossi17-sml,
title={Similarity-based Multi-label Learning},
author={Ryan A. Rossi and Nesreen K. Ahmed and Hoda Eldardiry and Rong Zhou},
booktitle={arXiv:1710.10335},
pages={1--6},
year={2017},
pdf={https://arxiv.org/pdf/1710.10335.pdf},
preprint={https://arxiv.org/abs/1710.10335},
supp={},
arxivPDF={https://arxiv.org/abs/1710.10335},
pubtype={},
abstract={Multi-label classification is an important learning problem with many applications. In this work, we propose a similarity-based approach for multi-label learning called SML. We also introduce a similarity-based approach for predicting the label set size. SML is amenable to streaming data and online learning, naturally able to handle changes in the problem domain, robust to training data with skewed class label sets, accurate with low variance, and lends itself to an efficient parallel implementation. The experimental results demonstrate the effectiveness of SML for multi-label classification where it is shown to compare favorably with a wide variety of existing algorithms across a range of evaluation criterion.},
keywords={multi-label classification, multi-label learning, kernel learning, similarity-based multi-label learning, similarity function learning, image classification, gene function classification},
active={false},
}
@inproceedings{rossi17graphlet-est,
title={Estimation of Graphlet Statistics},
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
booktitle={arXiv:1701.01772},
pages={1--14},
year={2017},
pdf={pubs/rossi-et-al-estimation-of-graphlet-statistics.pdf},
preprint={http://arxiv.org/abs/1701.01772},
supp={pubs/rossi-et-al-estimation-of-graphlet-statistics.pdf},
pubtype={},
abstract={Graphlets are induced subgraphs of a large network and are important for understanding and modeling complex networks. Despite their practical importance, graphlets have been severely limited to applications and domains with relatively small graphs. Most previous work has focused on exact algorithms, however, it is often too expensive to compute graphlets exactly in massive networks with billions of edges, and finding an approximate count is usually sufficient for many applications. In this work, we propose an unbiased graphlet estimation framework that is (a) fast with significant speedups compared to the state-of-the-art, (b) parallel with nearly linear-speedups, (c) accurate with <1\% relative error, (d) scalable and space-efficient for massive networks with billions of edges, and (e) flexible for a variety of real-world settings, as well as estimating macro and micro-level graphlet statistics (e.g., counts) of both connected and disconnected graphlets. In addition, an adaptive approach is introduced that finds the smallest sample size required to obtain estimates within a given user-defined error bound. On 300 networks from 20 domains, we obtain <1\% relative error for all graphlets. This is significantly more accurate than existing methods while using less data. Moreover, it takes a few seconds on billion edge graphs (as opposed to days/weeks). These are by far the largest graphlet computations to date.},
keywords={Graphlets, motifs, statistical estimation, unbiased estimators, subgraph counts, network motifs, motif statistics, massive networks, parallel algorithms, graph mining, graph kernels, machine learning},
active={true},
}
@inproceedings{ahmed2017roles,
title={Edge Role Discovery via Higher-order Structures},
author={Nesreen K. Ahmed and Ryan A. Rossi and Theodore L. Willke and Rong Zhou},
booktitle={PAKDD},
pages={1--12},
year={2017},
location={Jeju Island, Korea},
date={May 2017},
acceptance={28.2%},
publisher={Springer},
pdf={pubs/ahmed-et-al-pakdd17-preprint.pdf},
abstract={Previous work in network analysis has focused on modeling the roles of nodes in graphs.
In this paper, we introduce edge role discovery and propose a framework for learning and extracting edge roles from large graphs.
We also propose a general class of higher-order role models that leverage network motifs.
This leads us to develop a novel edge feature learning approach for role discovery that begins with higher-order network motifs and automatically learns deeper edge features.
All techniques are parallelized and shown to scale well.
They are also efficient with a time complexity of $\mathcal{O}(|E|)$.
The experiments demonstrate the effectiveness of our model for a variety of ML tasks such as improving classification and dynamic network analysis.},
notes={Previous work in network analysis has focused on modeling
the mixed-memberships of node roles in the graph, but
not the roles of edges. We introduce the edge role discovery
problem and present a generalizable framework for learning
and extracting edge roles from arbitrary graphs automatically.
Furthermore, while existing node-centric role models
have mainly focused on simple degree and egonet features,
this work also explores graphlet features for role discovery.
In addition, we also develop an approach for automatically
learning and extracting important and useful edge features
from an arbitrary graph. The experimental results demonstrate
the utility of edge roles for network analysis tasks on
a variety of graphs from various problem domains.},
keywords={higher-order role discovery, graphlet-based roles, edge roles, role discovery, graph similarity, behavioral roles, higher-order structures, latent space model, higher-order network model},
pubtype={conference},
active={true},
}
@article{ahmed2016revisiting,
title={Revisiting Role Discovery in Networks: From Node to Edge Roles},
author={Nesreen K. Ahmed and Ryan A. Rossi and Theodore L. Willke and Rong Zhou},
journal={arXiv preprint arXiv:1610.00844},
year={2016},
pdf={pubs/edge-role-discovery.pdf},
abstract={Previous work in network analysis has focused on modeling
the mixed-memberships of node roles in the graph, but
not the roles of edges. We introduce the edge role discovery
problem and present a generalizable framework for learning
and extracting edge roles from arbitrary graphs automatically.
Furthermore, while existing node-centric role models
have mainly focused on simple degree and egonet features,
this work also explores graphlet features for role discovery.
In addition, we also develop an approach for automatically
learning and extracting important and useful edge features
from an arbitrary graph. The experimental results demonstrate
the utility of edge roles for network analysis tasks on
a variety of graphs from various problem domains.},
pubtype={},
}
@inproceedings{ahmed17aaai,
title={A Higher-order Latent Space Network Model},
author={Nesreen K. Ahmed and Ryan A. Rossi and Theodore L. Willke and Rong Zhou},
booktitle={Proceedings of the AAAI PAIR (Plan, Activity, and Intent Recognition) Workshop},
pages={1--7},
year={2017},
location={San Francisco, CA, USA},
pdf={pubs/ahmed-et-al-aaai-pair-17.pdf},
supp={pubs/ahmed-et-al-aaai-pair-17.pdf},
pubtype={workshop},
abstract={Previous work in network analysis has focused on modeling node roles in the graph. In this work, we introduce edge role discovery and develop a general framework for modeling edge roles in large networks. In addition, a general class of higher-order role discovery methods are proposed that leverage features based on induced subgraphs (graphlets, motifs) for learning better and more useful roles. All methods are fast with a runtime that is linear in the number of edges and able to scale to large real-world networks via an effective parallel implementation. The experimental results demonstrate the utility of edge roles for network analysis tasks on a variety of graphs from various problem domains.},
keywords={role discovery, edge roles, edge role discovery, link roles, higher-order network analysis, higher-order network models, subgraph features, graphlets, network motifs, latent space models, transfer learning},
active={true},
}
@inproceedings{ahmed16bigdata,
title={{Estimation of Local Subgraph Counts}},
author={Nesreen K. Ahmed and Theodore L. Willke and Ryan A. Rossi},
booktitle={Proceedings of the IEEE International Conference on BigData},
pages={586--595},
year={2016},
location={Washington, D.C., USA},
external={https://dx.doi.org/10.1109/BigData.2016.7840651},
pdf={pubs/ahmed-et-al-BigData2016-Estimation-of-Local-Subgraph-Counts.pdf},
supp={pubs/ahmed-et-al-BigData2016-Estimation-of-Local-Subgraph-Counts.pdf},
acceptance={18.7%},
pubtype={conference},
abstract={Graphlets represent small induced subgraphs and are becoming increasingly important for a variety of applications. Despite the importance of the local subgraph (graphlet) counting problem, existing work focuses mainly on counting graphlets globally over the entire graph. These global counts have been used for tasks such as graph classification as well as for understanding and summarizing the fundamental structural patterns in graphs. In contrast, this work proposes an accurate, efficient, and scalable parallel framework for the more challenging problem of counting graphlets locally for a given edge or set of edges. The local graphlet counts provide a topologically rigorous characterization of the local structure surrounding an edge. The aim of this work is to obtain the count of every graphlet of size \emph{k} for each edge. The framework gives rise to efficient, parallel, and accurate unbiased estimation methods with provable error bounds, as well as exact algorithms for counting graphlets locally. Experiments demonstrate the effectiveness of the proposed exact and estimation methods on various datasets. In particular, the exact methods show strong scaling results (11--16x on 16 cores). Moreover, our estimation framework is accurate with error less than 5\% on average.},
keywords={Graphlets, subgraph counts, local motif counts,
edge graphlet counts,
estimation of graphlet counts,
statistical estimation,
estimation methods,
relational learning,
link classification,
parallel algorithms},
}
@article{ahmed2016kais,
author = {Nesreen K. Ahmed and Jennifer Neville and Ryan A. Rossi and Nick Duffield and Theodore L. Willke},
title = {Graphlet Decomposition: Framework, Algorithms, and Applications},
journal = {Knowledge and Information Systems (KAIS)},
pages = {689--722},
url = {http://nesreenahmed.com/graphlets},
code = {http://github.com/nkahmed/pgd},
year = {2016},
pubtype={journal},
preprint={https://arxiv.org/abs/1506.04322},
pdf={pubs/ahmed-et-al-kais16.pdf},
external={https://link.springer.com/article/10.1007/s10115-016-0965-5},
doi={https://doi.org/10.1007/s10115-016-0965-5},
keywords={Graphlets, motifs, Subgraph patterns, Induced subgraphs, Graphlet decomposition, Graphlet-based network properties, Network Motifs, Induced subgraph statistics, parallel algorithm, biological networks, social networks, network alignment, graph kernels, machine learning},
abstract={From social science to biology, numerous applications often rely on graphlets
for intuitive and meaningful characterization of networks. While graphlets have witnessed
a tremendous success and impact in a variety of domains, there has yet to be a
fast and efficient framework for computing the frequencies of these subgraph patterns.
However, existing methods are not scalable to large networks with billions of nodes and
edges. In this paper, we propose a fast, efficient, and parallel framework as well as a
family of algorithms for counting k-node graphlets. The proposed framework leverages
a number of theoretical combinatorial arguments that allow us to obtain significant
improvement on the scalability of graphlet counting. For each edge, we count a few
graphlets and obtain the exact counts of others in constant time using the combinatorial
arguments. On a large collection of 300+ networks from a variety of domains, our
graphlet counting strategies are on average 460x faster than existing methods. This
brings new opportunities to investigate the use of graphlets on much larger networks
and newer applications as we show in the experiments. To the best of our knowledge,
this paper provides the largest graphlet computations to date.}
}
@inproceedings{rossi16rsm,
title={{Relational Similarity Machines}},
author={Ryan A. Rossi and Rong Zhou and Nesreen K. Ahmed},
booktitle={Proceedings of the 12th International Workshop on Mining and Learning with Graphs (MLG)},
pages={1--8},
year={2016},
pdf={pubs/rossi-et-al-rsm-mlg16.pdf},
supp={pubs/rsm-mlg16-arxiv.pdf},
pubtype={workshop},
abstract={This paper proposes Relational Similarity Machines (RSM): a fast, accurate, and flexible relational learning framework for supervised and semi-supervised learning tasks. Despite the importance of relational learning, most existing methods are unable to handle large noisy attributed networks with low or even modest levels of relational autocorrelation. Furthermore, they also have issues with efficiency, accuracy, scalability, and flexibility. For instance, many existing methods perform poorly for multi-class classification problems, graphs that are sparsely labeled, or network data with low relational autocorrelation. In contrast, the proposed relational learning framework is designed to be (i) fast for learning and inference at real-time interactive rates, and (ii) flexible for a variety of learning settings, constraints, and application domains. The experiments demonstrate the effectiveness of RSM for a variety of tasks and data.},
active={true}
}
@inproceedings{rossi16cikm,
title={{Leveraging Multiple GPUs and CPUs for Graphlet Counting in Large Networks}},
author={Ryan A. Rossi and Rong Zhou},
booktitle={ACM International Conference on Information and Knowledge Management (CIKM)},
pages={1783--1792},
year={2016},
pdf={pubs/rossi-et-al-cikm16-Hybrid-Multi-GPU-CPU-Graphlets.pdf},
slides={talks/CIKM16-talk-slides.pdf},
slideshare={http://www.slideshare.net/slideshow/embed_code/key/joiWlNwZHcN03v},
pubtype={conference},
abstract={Massively parallel architectures such as the GPU are becoming increasingly important due to the recent proliferation of data. In this paper, we propose a key class of hybrid parallel graphlet algorithms that leverages multiple CPUs and GPUs simultaneously for computing k-vertex induced subgraph statistics (called graphlets). In addition to the hybrid multi-core CPU-GPU framework, we also investigate single GPU methods (using multiple cores) and multi-GPU methods that leverage all available GPUs simultaneously for computing induced subgraph statistics. Both methods leverage GPU devices only, whereas the hybrid multi-core CPU-GPU framework leverages all available multi-core CPUs and multiple GPUs for computing graphlets in large networks. Compared to recent approaches, our methods are orders of magnitude faster, while also more cost effective enjoying superior performance per capita and per watt. In particular, the methods are up to 300+ times faster than a recent state-of-the-art method. To the best of our knowledge, this is the first work to leverage multiple CPUs and GPUs simultaneously for computing induced subgraph statistics.},
keywords={Graphlets, statistical estimation, subgraph counts, induced subgraphs, network motifs, graphlet decomposition, GPU computing, multi-GPU, heterogeneous computing, parallel algorithms, graph mining, role learning, relational learning, graph classification, relational classification, prediction, graph kernels, feature learning, node and edge embedding},
active={true}
}
@inproceedings{ahmed16bigmine,
title={{Exact and Estimation of Local Edge-centric Graphlet Counts}},
author={Nesreen Ahmed and Ted Willke and Ryan A. Rossi},
booktitle={KDD BigMine},
pages={16},
year={2016},
pdf={pubs/ahmed-et-al-kdd-bigmine16.pdf},
pubtype={},
abstract={Graphlets represent small induced subgraphs and are becoming increasingly important for a variety of applications. Despite the importance of the local graphlet problem, existing work focuses mainly on counting graphlets globally over the entire graph. These global counts have been used for tasks such as graph classification as well as for understanding and summarizing the fundamental structural patterns in graphs. In contrast, this work proposes a flexible, efficient, and scalable parallel framework for the more challenging problem of counting graphlets locally for a given edge or set of edges. The local graphlet counts provide a topologically rigorous characterization of the local structure surrounding an edge. The aim of this work is to obtain the count of every graphlet of size $k \in \{3, 4\}$ for each edge. The framework gives rise to efficient, parallel, and accurate unbiased estimation methods as well as exact graphlet algorithms for counting graphlets locally. Experiments demonstrate the effectiveness of the proposed exact and estimation methods.},
keywords={Graphlets, local graphlet counts, edge graphlet counts, edge features, statistical estimation, induced subgraphs, motifs, network analysis, statistical relational learning, link classification, parallel algorithms, subgraph counts, induced subgraphs, network motifs, graphlet decomposition, node embedding, edge embedding, feature learning},
active={true}
}
@inproceedings{rossi16factorization,
title={{Parallel Collective Factorization for Modeling Large Heterogeneous Networks}},
author={Ryan A. Rossi and Rong Zhou},
booktitle={Social Network Analysis and Mining (SNAM)},
pages={30},
year={2016},
pdf={pubs/rossi2016-PCMF.pdf},
preprint={pubs/rossi-et-al-pcmf-snam16-preprint.pdf},
external={https://dx.doi.org/10.1007/s13278-016-0349-6},
pubtype={journal},
abstract={Relational learning methods for heterogeneous network data are becoming increasingly important for many real-world applications.
However, existing relational learning approaches are sequential, inefficient, unable to scale to large heterogeneous networks, as well as many other limitations related to convergence, parameter tuning, etc.
In this paper, we propose Parallel Collective Matrix Factorization (PCMF) that serves as a fast and flexible framework for joint modeling of a variety of heterogeneous network data.
The PCMF learning algorithm solves for a single parameter given the others, leading to a parallel scheme that is fast, flexible, and general for a variety of relational learning tasks and heterogeneous data types.
The proposed approach is carefully designed to be
(a) efficient for large heterogeneous networks (linear in the total number of observations from the set of input matrices),
(b) flexible as many components are interchangeable and easily adaptable, and
(c) effective for a variety of applications as well as for different types of data.
The experiments demonstrate the scalability, flexibility, and effectiveness of PCMF for a variety of relational modeling tasks.
In particular, PCMF outperforms a recent state-of-the-art approach in runtime, scalability, and prediction quality.
Finally, we also investigate variants of PCMF for serving predictions in a real-time streaming fashion.},
keywords={Recommender systems, missing value estimation, matrix completion, relational learning, low rank approximation, parallelization, scalable graph models, matrix factorization, collective factorization, coupled matrix-tensor factorization, cyclic coordinate descent, heterogeneous networks, prediction, social networks, link prediction, role discovery, network analysis},
active={true}
}
@inproceedings{ahmed2015icdm,
title={Efficient Graphlet Counting for Large Networks},
author={Nesreen K. Ahmed and Jennifer Neville and Ryan A. Rossi and Nick Duffield},
booktitle={Proceedings of the IEEE International Conference on Data Mining (ICDM)},
pages={1--10},
year={2015},
external={https://dx.doi.org/10.1109/ICDM.2015.141},
pubtype={conference},
abstract={From social science to biology, numerous applications often rely on graphlets for intuitive and meaningful characterization of networks at both the global macro-level as well as the local micro-level. While graphlets have witnessed a tremendous success and impact in a variety of domains, there has yet to be a fast and efficient approach for computing the frequencies of these subgraph patterns. However, existing methods are not scalable to large networks with millions of nodes and edges, which impedes the application of graphlets to new problems that require large-scale network analysis. To address these problems, we propose a fast, efficient, and parallel algorithm for counting graphlets up to size k=4-nodes that take only a fraction of the time to compute when compared with the current methods used. The proposed graphlet counting algorithms leverages a number of proven combinatorial arguments for different graphlets. For each edge, we count a few graphlets, and with these counts along with the combinatorial arguments, we obtain the exact counts of others in constant time. On a large collection of 300+ networks from a variety of domains, our graphlet counting strategies are on average 460x faster than current methods. This brings new opportunities to investigate the use of graphlets on much larger networks and newer applications as we show in the experiments. To the best of our knowledge, this paper provides the largest graphlet computations to date as well as the largest systematic investigation on over 300+ networks from a variety of domains.},
keywords={graph theory, mathematics computing, parallel algorithms, graphlet counting algorithm, graphlet counting strategy, large-scale network analysis, parallel algorithm, social science, Algorithm design and analysis, Frequency-domain analysis, Kernel, Peer-to-peer computing, Scalability, Systematics, graph classification, graph kernel, graphlet, motif counting, parallel method, visual analytics},
pdf={pubs/ahmed-et-al-icdm2015.pdf},
url = {http://github.com/nkahmed/pgd},
code = {http://github.com/nkahmed/pgd},
}
@inproceedings{rossi2016aaai,
author = {Ryan Rossi and Rong Zhou},
title = {Toward Interactive Relational Learning},
booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
pages = {4383--4384},
year = {2016},
pdf={pubs/rossi-et-al-toward-iRML.pdf},
poster={poster/AAAI16-Interactive-RML.pdf},
abstract={This paper introduces the Interactive Relational Machine Learning (iRML) paradigm in which users interactively design relational models by specifying the various components, constraints, and relational data representation, as well as perform evaluation, analyze errors, and make adjustments and refinements in a closed-loop. iRML requires fast real-time learning and inference methods capable of interactive rates. Methods are investigated that enable direct manipulation of the various components of the RML method. Visual representation and interaction techniques are also developed for exploring the space of relational models and the trade-offs of the various components and design choices.},
keywords={interactive machine learning, interactive relational learning},
pubtype={conference},
}
@article{nr-sigkdd16,
title = {An Interactive Data Repository with Visual Analytics},
author = {Ryan A. Rossi and Nesreen K. Ahmed},
journal = {SIGKDD Explor.},
volume = {17},
number = {2},
month = feb,
year = {2016},
issn = {1931-0145},
pages = {37--41},
url={http://networkrepository.com},
external = {https://dx.doi.org/10.1145/2897350.2897355},
publisher = {ACM},
pdf={pubs/rossi-et-al-sigkdd-expl.pdf},
poster={poster/Interactive-Network-Repository-Visual-Analytics.pdf},
abstract = {Scientific data repositories have historically made data widely accessible to the scientific community, and have led to better research through comparisons, reproducibility, as well as further discoveries and insights. Despite the growing importance and utilization of data repositories in many scientific disciplines, the design of existing data repositories has not changed for decades. In this paper, we revisit the current design and envision interactive data repositories, which not only make data accessible, but also provide techniques for interactive data exploration, mining, and visualization in an easy, intuitive, and free-flowing manner.},
pubtype={journal},
}
@inproceedings{rossi2015dsaa-pcmf,
author={Ryan A. Rossi and Rong Zhou},
booktitle={IEEE International Conference on Data Science and Advanced Analytics (DSAA)},
title={Scalable Relational Learning for Large Heterogeneous Networks},
year={2015},
pages={1--10},
pubtype={conference},
external={https://dx.doi.org/10.1109/DSAA.2015.7344799},
abstract={Relational models for heterogeneous network data are becoming increasingly important for many real-world applications. However, existing relational learning approaches are not parallel, have scalability issues, and thus unable to handle large heterogeneous network data. In this paper, we propose Parallel Collective Matrix Factorization (PCMF) that serves as a fast and flexible framework for joint modeling of large heterogeneous networks. The PCMF learning algorithm solves for a single parameter given the others, leading to a parallel scheme that is fast, flexible, and general for a variety of relational learning tasks and heterogeneous data types. The proposed approach is carefully designed to be (a) efficient for large heterogeneous networks (linear in the total number of observations from the set of input matrices), (b) flexible as many components are interchangeable and easily adaptable, and (c) effective for a variety of applications as well as for different types of data. The experiments demonstrate the scalability, flexibility, and effectiveness of PCMF. For instance, we show that PCMF outperforms a recent state-of-the-art parallel approach in runtime, scalability, and prediction quality. Finally, the effectiveness of PCMF is shown on a number of relational learning tasks such as serving predictions in a realtime streaming fashion.},
keywords={data handling, learning (artificial intelligence), matrix decomposition, PCMF learning algorithm, heterogeneous network data, large heterogeneous networks, parallel collective matrix factorization, relational models, scalable relational learning, serving predictions, Data models, Heterogeneous networks, Prediction algorithms, Recommender systems, Runtime, Scalability, Sparse matrices},
doi={10.1109/DSAA.2015.7344799},
month=oct,
pdf={pubs/rossi-zhou-pcmf-dsaa15.pdf},
location={Paris, France},
}
@inproceedings{ahmed-icwsm15,
title = {Interactive Visual Graph Analytics on the Web},
author = {Nesreen K. Ahmed and Ryan A. Rossi},
booktitle = {International AAAI Conference on Web and Social Media (ICWSM)},
pages={566--569},
year={2015},
pubtype={conference},
pdf={pubs/ahmed-icwsm15.pdf},
external={http://www.aaai.org/ocs/index.php/ICWSM/ICWSM15/paper/view/10535},
keywords = {Visual Analytics, Visual Graph Analytics, Web-based Visual Analytics, Network Visualization, Interactive Visualization, Interactive Graph Mining, Interactive Graph Generation, Exploratory Analysis, Dynamic Network Analysis, Multi-level Visual Analytics},
abstract = {We present a web-based network visual analytics platform called GraphVis that combines interactive visualizations with analytic techniques to reveal important patterns and insights for sense making, reasoning, and decision-making. The platform is designed with simplicity in mind and allows users to visualize and explore networks in seconds with a simple drag-and-drop of a graph file into the web browser. GraphVis is fast and flexible, web-based, requires no installation, while supporting a wide range of graph formats as well as state-of-the-art visualization and analytic techniques. In particular, the multi-level network analysis engine of GraphVis gives rise to a variety of new possibilities for exploring, analyzing, and understanding complex networks interactively in real-time. Finally, we also highlight other key aspects including filtering, querying, ranking, manipulating, exporting, partitioning (community/role discovery), as well as tools for dynamic network analysis and visualization, interactive graph generators (including two new block model approaches), and a variety of multi-level network analysis and statistical techniques.},
}
@inproceedings{nr-aaai15,
title = {The Network Data Repository with Interactive Graph Analytics and Visualization},
author={Ryan A. Rossi and Nesreen K. Ahmed},
booktitle = {Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence (AAAI)},
url={http://networkrepository.com},
year={2015},
abbr={AAAI},
pdf={pubs/aaai15-nr.pdf},
pubtype={conference},
abstract={Network Repository (NR) is the first interactive data repository with a web-based platform for visual interactive analytics. Unlike other data repositories (e.g., UCI ML Data Repository, and SNAP), the network data repository (networkrepository.com) allows users to not only download, but to interactively analyze and visualize such data using our web-based interactive graph analytics platform. Users can in real-time analyze, visualize, compare, and explore data along many different dimensions. The aim of NR is to make it easy to discover key insights into the data extremely fast with little effort while also providing a medium for users to share data, visualizations, and insights. Other key factors that differentiate NR from the current data repositories is the number of graph datasets, their size, and variety. While other data repositories are static, they also lack a means for users to collaboratively discuss a particular dataset, corrections, or challenges with using the data for certain applications. In contrast, NR incorporates many social and collaborative aspects that facilitate scientific research, e.g., users can discuss each graph, post observations, and visualizations.},
poster={poster/Interactive-Network-Repository-Visual-Analytics.pdf},
}
@article{rossi2015pmc-sisc,
title={Parallel Maximum Clique Algorithms with Applications to Network Analysis},
author={Ryan A. Rossi and David F. Gleich and Assefaw H. Gebremedhin},
journal={SIAM Journal on Scientific Computing (SISC)},
volume={37},
number={5},
pages={28},
year={2015},
publisher={Society for Industrial and Applied Mathematics (SIAM)},
pdf={pubs/rossi-et-al-sisc2015-preprint.pdf},
url={http://github.com/ryanrossi/pmc},
code={http://github.com/ryanrossi/pmc},
supp={http://maxcliques.com},
pubtype={journal},
abstract={We present a fast, parallel maximum clique algorithm for large sparse graphs that is designed to exploit characteristics of social and information networks. The method exhibits a roughly linear runtime scaling over real-world networks ranging from a thousand to a hundred million nodes. In a test on a social network with 1.8 billion edges, the algorithm finds the largest clique in about 20 minutes. At its heart the algorithm employs a branch-and-bound strategy with novel and aggressive pruning techniques. The pruning techniques include the combined use of core numbers of vertices along with a good initial heuristic solution to remove the vast majority of the search space. In addition, the exploration of the search tree is parallelized. During the search, processes immediately communicate changes to upper and lower bounds on the size of maximum clique. This exchange of information occasionally results in a super-linear speedup because tasks with large search spaces can be pruned by other processes. We demonstrate the impact of the algorithm on applications using two different network analysis problems: computation of temporal strong components in dynamic networks and determination of compression-friendly ordering of nodes of massive networks.},
external={http://epubs.siam.org/doi/abs/10.1137/14100018X},
keywords={maximum clique, combinatorial optimization, temporal strong components, parallel maximum clique algorithms, branch-and-bound, network analysis, graph compression},
}
@article{rossi2015roles,
title={Role Discovery in Networks},
author={Ryan A. Rossi and Nesreen K. Ahmed},
journal={IEEE Transactions on Knowledge and Data Engineering (TKDE)},
abbr={TKDE},
volume={27},
number={4},
pages={1112--1131},
year={2015},
publisher={IEEE},
pdf={pubs/rossi-ahmed-TKDE2014.pdf},
pubtype={journal},
preprint={http://arxiv.org/abs/1405.7134},
doi={10.1109/TKDE.2014.2349913},
external={https://dx.doi.org/10.1109/TKDE.2014.2349913},
abstract={Roles represent node-level connectivity patterns such as star-center, star-edge nodes, near-cliques or nodes that act as bridges to different regions of the graph. Intuitively, two nodes belong to the same role if they are structurally similar. Roles have been mainly of interest to sociologists, but more recently, roles have become increasingly useful in other domains. Traditionally, the notion of roles were defined based on graph equivalences such as structural, regular, and stochastic equivalences. We briefly revisit these early notions and instead propose a more general formulation of roles based on the similarity of a feature representation (in contrast to the graph representation). This leads us to propose a taxonomy of three general classes of techniques for discovering roles that includes (i) graph-based roles, (ii) feature-based roles, and (iii) hybrid roles. We also propose a flexible framework for discovering roles using the notion of similarity on a feature-based representation. The framework consists of two fundamental components: (a) role feature construction and (b) role assignment using the learned feature representation. We discuss the different possibilities for discovering feature-based roles and the tradeoffs of the many techniques for computing them. Finally, we discuss potential applications and future directions and challenges.},
keywords={Communities, Computational modeling, Social network modeling, Stochastic processes, Correlation, Bridges, unsupervised learning, Roles, role discovery, role learning, feature-based roles, structural similarity, model selection, minimum description length, graph similarity, network positions, membership models, machine learning, relational learning, regularization, tensor factorization, non-negative matrix factorization, big data, parallel algorithms, high-performance computing, interactive machine learning, graph kernels},
}
@article{rossi2014coloring,
title={Coloring Large Complex Networks},
author={Ryan A. Rossi and Nesreen K. Ahmed},
journal={Social Network Analysis and Mining},
eid={228},
volume={4},
number={1},
pages={37},
year={2014},
pubtype={journal},
pdf={pubs/rossi-ahmed-SNAM2014.pdf},
external={https://dx.doi.org/10.1007/s13278-014-0228-y},
abstract={Given a large social or information network,
how can we partition the vertices into sets (i.e., colors)
such that no two vertices linked by an edge are in the same
set while minimizing the number of sets used. Despite the
obvious practical importance of graph coloring, existing
works have not systematically investigated or designed
methods for large complex networks. In this work, we
develop a unified framework for coloring large complex
networks that consists of two main coloring variants that
effectively balances the tradeoff between accuracy and
efficiency. Using this framework as a fundamental basis,
we propose coloring methods designed for the scale and
structure of complex networks. In particular, the methods
leverage triangles, triangle-cores, and other egonet properties
and their combinations. We systematically compare
the proposed methods across a wide range of networks
(e.g., social, web, biological networks) and find a significant
improvement over previous approaches in nearly all
cases. Additionally, the solutions obtained are nearly
optimal and sometimes provably optimal for certain classes
of graphs (e.g., collaboration networks). We also propose a
parallel algorithm for the problem of coloring neighborhood
subgraphs and make several key observations.
Overall, the coloring methods are shown to be (1) accurate
with solutions close to optimal, (2) fast and scalable for
large networks, and (3) flexible for use in a variety of
applications.},
keywords={Network coloring, graph coloring, unified coloring framework,
greedy methods, neighborhood coloring, social networks, combinatorial optimization, NP-hard}
}
@inproceedings{rossi2014pakdd,
author={Ryan A. Rossi},
title={Fast Triangle Core Decomposition for Mining Large Graphs},
booktitle={Advances in Knowledge Discovery and Data Mining (PAKDD)},
year={2014},
publisher={Springer},
pages={310--322},
external={https://dx.doi.org/10.1007/978-3-319-06608-0_26},
location={Tainan, Taiwan},
pubtype={conference},
pdf={pubs/rossi-fast-triangle-core-ktruss.pdf},
abstract={Large triangle cores represent dense subgraphs for which
each edge has at least k-2 triangles (same as cliques). This paper
presents a fast algorithm for computing the triangle core decomposition
on big graphs. The proposed triangle core algorithm adapts both the
computations and representation based on the properties of the graph.
In addition, we develop a fast edge-based parallel triangle counting algorithm,
which lies at the heart of the triangle core decomposition. The
proposed algorithm is orders of magnitude faster than the currently available
approach. We also investigate and propose fast methods for two
variants of the triangle core problem: computing only the top-k triangle
cores fast and finding the maximum triangle core number of the graph.
The experiments demonstrate the scalability and effectiveness of our approach
on 150+ networks with up to 1.8 billion-edges. Further, we apply
the proposed methods for graph mining tasks including finding dense
subgraphs, temporal strong components, and maximum cliques.},
keywords={triangle cores, k-truss, parallel triangle counting, triangle core decomposition, k-truss decomposition, triangle-core, parallel triangle counting, maximum clique, temporal strong components, triangle-core ordering},
}
@inproceedings{rossi2013topology,
title={A Multi-Level Approach for Evaluating Internet Topology Generators},
author={Ryan A. Rossi and Sonia Fahmy and Nilothpal Talukder},
booktitle={Networking},
pages={1--9},
year={2013},
slides={talks/ifip2013-slides.pdf},
external={http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6663527},
pdf={pubs/topology-ifip13.pdf},
pubtype={conference},
abstract={The topology of a network (connectivity of autonomous
systems (ASes) or routers) has significant implications
on the design of protocols and applications, and on the placement
of services and data centers. Researchers and practitioners
alike need realistic topologies for their simulation, emulation,
and testbed experiments. In this work, we propose a multi-level
framework for analyzing Internet topologies and their
evolution. Our multi-level framework includes novel measures,
evaluation strategies, and techniques for automatically learning
a representative set of graph measures. We apply our framework
to analyze topologies from two recent topology generators, Orbis
and WIT, according to how well they achieve their advertised
objectives. The generated topologies are compared to a set of
benchmark datasets that approximate different views of the
Internet in the data (trace-route measurements), control (BGP),
and management (WHOIS) planes. Our results demonstrate key
limitations of popular generators, and show that the recent
Internet clustering coefficient and average distance are not time-invariant
as assumed by many models. Additionally, we develop
a taxonomy of topology generators, and identify key challenges
in topology modeling.},
keywords={synthetic graph generation, Internet AS, topology generators, synthetic graphs, Internet evolution, multi-level evaluation framework, graph measures, node measures, link measures}
}
@inproceedings{rossi2014pmc-www,
title={Fast Maximum Clique Algorithms for Large Graphs},
author={Ryan A. Rossi and David F. Gleich and Assefaw H. Gebremedhin and Mostofa A. Patwary},
booktitle={Proceedings of the 23rd International Conference on World Wide Web (WWW)},
year={2014},
pubtype={conference},
pdf={pubs/www14-pmc-rossi.pdf},
supp={http://arxiv.org/pdf/1302.6256v2.pdf},
preprint={http://arxiv.org/pdf/1302.6256v2.pdf},
code={http://github.com/ryanrossi/pmc},
poster={poster/pmc-poster-clique-is-easy.pdf},
abstract={We propose a fast, parallel maximum clique algorithm for
large sparse graphs that is designed to exploit characteristics
of social and information networks. Despite clique's status
as an NP-hard problem with poor approximation guarantees,
our method exhibits nearly linear runtime scaling over real-world
networks ranging from 1000 to 100 million nodes. In a
test on a social network with 1.8 billion edges, the algorithm
finds the largest clique in about 20 minutes. Key to the
efficiency of our algorithm are an initial heuristic procedure
that finds a large clique quickly and a parallelized branch
and bound strategy with aggressive pruning and ordering
techniques. We use the algorithm to compute the largest
temporal strong components of temporal contact networks.},
}
@inproceedings{rossi2013trianglecores,
title={Triangle Core Decomposition and Maximum Cliques},
author={Ryan A. Rossi and David F. Gleich and Assefaw H. Gebremedhin},
booktitle={SIAM Workshop on Network Science},
pages={1--2},
year={2013},
pubtype={workshop},
pdf={pubs/triangle-cores-and-maximum-cliques.pdf},
abstract={Consider a graph G = (V, E). A k-core of G is a maximal induced subgraph of G where each vertex
has degree at least k. There is a linear time O(|E|+|V|) algorithm to compute the maximum k such
that a vertex is in a k-core for all vertices in the graph [1]. Because of this efficient algorithm, and a simple,
but often powerful, interpretation, k-cores are frequently utilized to study modern networks. Important
k-core related quantities include the size of the 2-core compared with the graph, the distribution of
k-core sizes, the largest k-core, and many similar quantities. In particular, the maximum value of k such
that there is a (k-1)-core in G is known as the coloring number of a graph and provides an upper-bound
on the largest clique in G. In this paper, we will discuss a recently proposed generalization of k-cores
to a triangle motif (k-truss), refine a procedure to compute them to enable computations on large graphs, and
use the relationship between largest triangle core to confirm that a heuristic maximum clique finder
discovers the optimal solution for many social and information networks.},
}
@article{rossi2014dynamical,
title={A Dynamical System for PageRank with Time-Dependent Teleportation},
author={David F. Gleich and Ryan A. Rossi},
journal={Internet Mathematics},
volume = {10},
number = {1-2},
pages={188--217},
year={2014},
pubtype={journal},
pdf={pubs/dynamic-pagerank.pdf},
external={https://dx.doi.org/10.1080/15427951.2013.814092},
code={https://github.com/ryanrossi/dynamic-pagerank},
abstract={We propose a dynamical system that captures changes to the network centrality
of nodes as external interest in those nodes varies. We derive this system by
adding time-dependent teleportation to the PageRank score. The result is not a single
set of importance scores, but rather a time-dependent set. These can be converted into
ranked lists in a variety of ways, for instance, by taking the largest change in the importance
score. For an interesting class of dynamic teleportation functions, we derive
closed-form solutions for the dynamic PageRank vector. The magnitude of the deviation
from a static PageRank vector is given by a PageRank problem with complex-valued
teleportation parameters. Moreover, these dynamical systems are easy to evaluate. We
demonstrate the utility of dynamic teleportation on both the article graph of Wikipedia,
where the external interest information is given by the number of hourly visitors to each
page, and the Twitter social network, where external interest is the number of tweets
per month. For these problems, we show that using information from the dynamical
system helps improve a prediction task and identify trends in the data.},
keywords={dynamical system, dynamic PageRank, causality, time series prediction, dynamic networks},
}
@inproceedings{rossi2013modeling,
title={Modeling Dynamic Behavior in Large Evolving Graphs},
author={Ryan A. Rossi and Brian Gallagher and Jennifer Neville and Keith Henderson},
booktitle={Proceedings of the Sixth ACM International Conference on Web Search and Data Mining (WSDM)},
pages={667--676},
year={2013},
pdf={pubs/wsdm13-dbmm.pdf},
poster={poster/wsdm13-dbmm-poster-rossi.pdf},
slides={talks/wsdm13-dbmm-rossi.pdf},
abstract={Given a large time-evolving graph, how can we model and characterize the temporal behaviors of individual nodes (and network states)? How can we model the behavioral transition patterns of nodes? We propose a temporal behavior model that captures the "roles" of nodes in the graph and how they evolve over time. The proposed dynamic behavioral mixed-membership model (DBMM) is scalable, fully automatic (no user-defined parameters), non-parametric/data-driven (no specific functional form or parameterization), interpretable (identifies explainable patterns), and flexible (applicable to dynamic and streaming networks). Moreover, the interpretable behavioral roles are generalizable and computationally efficient. We applied our model for (a) identifying patterns and trends of nodes and network states based on the temporal behavior, (b) predicting future structural changes, and (c) detecting unusual temporal behavior transitions. The experiments demonstrate the scalability, flexibility, and effectiveness of our model for identifying interesting patterns, detecting unusual structural transitions, and predicting the future structural changes of the network and individual nodes.},
location = {Rome, Italy},
external={https://doi.acm.org/10.1145/2433396.2433479},
doi = {10.1145/2433396.2433479},
acmid = {2433479},
publisher = {ACM},
keywords = {dynamic mixed-membership models, dynamic network analysis, dynamic roles, graph mining, scalable models},
pubtype={conference},
}
@article{rossi2012transforming,
title={Transforming Graph Data for Statistical Relational Learning},
author={Ryan A. Rossi and Luke K. McDowell and David W. Aha and Jennifer Neville},
journal={Journal of Artificial Intelligence Research (JAIR)},
volume={45},
pages={363--441},
year={2012},
publisher={AAAI Press},
pubtype={journal},
pdf={pubs/transforming-srl-jair.pdf},
external={http://jair.org/pubs/paper3659.html},
abstract={Relational data representations have become an increasingly important topic due to
the recent proliferation of network datasets (e.g., social, biological, information networks)
and a corresponding increase in the application of Statistical Relational Learning (SRL)
algorithms to these domains. In this article, we examine and categorize techniques for
transforming graph-based relational data to improve SRL algorithms. In particular, appropriate
transformations of the nodes, links, and/or features of the data can dramatically
affect the capabilities and results of SRL algorithms. We introduce an intuitive taxonomy
for data representation transformations in relational domains that incorporates link transformation
and node transformation as symmetric representation tasks. More specifically,
the transformation tasks for both nodes and links include (i) predicting their existence, (ii)
predicting their label or type, (iii) estimating their weight or importance, and (iv) systematically
constructing their relevant features. We motivate our taxonomy through detailed
examples and use it to survey competing approaches for each of these tasks. We also discuss
general conditions for transforming links, nodes, and features. Finally, we highlight
challenges that remain to be addressed.},
keywords={Network Representation Learning, Statistical Relational Learning, relational data, relational representation learning, representation learning, graph-based learning, graph-based representation learning, link prediction, link weighting, node prediction, node weighting, feature construction, relational feature learning, graph transformations}
}
@inproceedings{rossi2012dynamic,
author = {Ryan A. Rossi and David F. Gleich},
title = {Dynamic PageRank using Evolving Teleportation},
booktitle = {Algorithms and Models for the Web Graph},
year = {2012},
editor = {Anthony Bonato and Jeannette Janssen},
volume = {7323},
series = {Lecture Notes in Computer Science},
pages = {126--137},
publisher = {Springer},
pdf={pubs/rossi-gleich-dynamic-pagerank.pdf},
external={http://www.springer.com/us/book/9783642305405},
slides={http://www.slideshare.net/ryanrossi/dynamic-pagerank-using-evolving-teleportation},
slideshare={http://www.slideshare.net/slideshow/embed_code/key/arXCeFyO8baYSV},
code={http://github.com/ryanrossi/dynamic-pagerank},
supp={dynamic_pagerank.php},
abstract={The importance of nodes in a network constantly fluctuates
based on changes in the network structure as well as changes in external
interest. We propose an evolving teleportation adaptation of the PageRank
method to capture how changes in external interest influence the
importance of a node. This framework seamlessly generalizes PageRank
because the importance of a node will converge to the PageRank values if
the external influence stops changing. We demonstrate the effectiveness
of the evolving teleportation on the Wikipedia graph and the Twitter
social network. The external interest is given by the number of hourly
visitors to each page and the number of monthly tweets for each user.},
keywords={dynamic PageRank, influence modeling, node importance, dynamic networks, temporal networks, node ranking, forecasting, multi-variate relational regression, Granger causality, causal link modeling},
}
@article{rossi2012relational-rep-learning,
title={Transforming graph representations for statistical relational learning},
author={Ryan A. Rossi and Luke K. McDowell and David W. Aha and Jennifer Neville},
journal={arXiv:1204.0033},
year={2012},
pdf={pubs/rossi-et-al-relational-representation-learning.pdf},
preprint={https://arxiv.org/pdf/1204.0033.pdf},
external={https://arxiv.org/abs/1204.0033},
abstract={Relational data representations have become an increasingly important topic due to the recent proliferation of network datasets (e.g., social, biological, information networks) and a corresponding increase in the application of statistical relational learning (SRL) algorithms to these domains. In this article, we examine a range of representation issues for graph-based relational data. Since the choice of relational data representation for the nodes, links, and features can dramatically affect the capabilities of SRL algorithms, we survey approaches and opportunities for relational representation transformation designed to improve the performance of these algorithms. This leads us to introduce an intuitive taxonomy for data representation transformations in relational domains that incorporates link transformation and node transformation as symmetric representation tasks. In particular, the transformation tasks for both nodes and links include (i) predicting their existence, (ii) predicting their label or type, (iii) estimating their weight or importance, and (iv) systematically constructing their relevant features. We motivate our taxonomy through detailed examples and use it to survey and compare competing approaches for each of these tasks. We also discuss general conditions for transforming links, nodes, and features. Finally, we highlight challenges that remain to be addressed.},
keywords={Network Representation Learning, Statistical Relational Learning, relational data, relational representation learning, representation learning, graph-based learning, graph-based representation learning, link prediction, link weighting, node prediction, node weighting, feature construction, relational feature learning, graph transformations}
}
@article{rossi11representations,
title={Representations and Ensemble Methods for Dynamic Relational Classification},
author={Ryan A. Rossi and Jennifer Neville},
journal={arXiv:1111.5312},
year={2011},
pdf={pubs/rossi-et-al-2011-dynamic-network-learning.pdf},
preprint={https://arxiv.org/pdf/1111.5312.pdf},
external={https://arxiv.org/abs/1111.5312},
keywords={Dynamic network representation learning, classification, dynamic network modeling, temporal networks, time-evolving graphs},
abstract={Temporal networks are ubiquitous and evolve over time by the addition, deletion, and changing of links, nodes, and attributes. Although many relational datasets contain temporal information, the majority of existing techniques in relational learning focus on static snapshots and ignore the temporal dynamics. We propose a framework for discovering temporal representations of relational data to increase the accuracy of statistical relational learning algorithms. The temporal relational representations serve as a basis for classification, ensembles, and pattern mining in evolving domains. The framework includes (1) selecting the time-varying relational components (links, attributes, nodes), (2) selecting the temporal granularity, (3) predicting the temporal influence of each time-varying relational component, and (4) choosing the weighted relational classifier. Additionally, we propose temporal ensemble methods that exploit the temporal-dimension of relational data. These ensembles outperform traditional and more sophisticated relational ensembles while avoiding the issue of learning the most optimal representation. Finally, the space of temporal-relational models are evaluated using a sample of classifiers. In all cases, the proposed temporal-relational classifiers outperform competing models that ignore the temporal information. The results demonstrate the capability and necessity of the temporal-relational representations for classification, ensembles, and for mining temporal datasets.},
}
@inproceedings{rossi2012role,
title={Role-Dynamics: Fast Mining of Large Dynamic Networks},
author={Ryan A. Rossi and Brian Gallagher and Jennifer Neville and Keith Henderson},
booktitle={Proceedings of the 21st International Conference Companion on World Wide Web (WWW)},
pages={997--1006},
year={2012},
organization={ACM},
pdf={pubs/rossi-www2012.pdf},
external={https://dx.doi.org/10.1145/2187980.2188234},
location={Lyon, France},
abstract={To understand the structural dynamics of a large-scale social,
biological or technological network, it may be useful
to discover behavioral roles representing the main connectivity
patterns present over time. In this paper, we propose
a scalable non-parametric approach to automatically
learn the structural dynamics of the network and individual
nodes. Roles may represent structural or behavioral patterns
such as the center of a star, peripheral nodes, or bridge
nodes that connect different communities. Our novel approach
learns the appropriate structural ``role'' dynamics for
any arbitrary network and tracks the changes over time. In
particular, we uncover the specific global network dynamics
and the local node dynamics of a technological, communication,
and social network. We identify interesting node
and network patterns such as stationary and non-stationary
roles, spikes/steps in role-memberships (perhaps indicating
anomalies), increasing/decreasing role trends, among many
others. Our results indicate that the nodes in each of these
networks have distinct connectivity patterns that are non-stationary
and evolve considerably over time. Overall, the
experiments demonstrate the effectiveness of our approach
for fast mining and tracking of the dynamics in large networks.
Furthermore, the dynamic structural representation
provides a basis for building more sophisticated models and
tools that are fast for exploring large dynamic networks.},
keywords={role discovery, dynamic networks, dynamic network analysis, anomaly detection, non-parametric model, large graphs, behavioral roles, structural dynamics}
}
@inproceedings{rossi2012dynamic-srl,
title={Time-evolving Relational Classification and Ensemble Methods},
author={Ryan A. Rossi and Jennifer Neville},
booktitle={Proceedings of the 16th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
pages={1--13},
year={2012},
publisher={Springer},
pdf={pubs/rossi-tenc-pakdd2012.pdf},
external={https://dx.doi.org/10.1007/978-3-642-30217-6_1},
slides={http://www.slideshare.net/ryanrossi/timeevolving-relational-classification-and-ensemble-methods},
slideshare={http://www.slideshare.net/slideshow/embed_code/key/ihSCHtI4gS3ahQ},
location={Kuala Lumpur, Malaysia},
pubtype={conference},
abstract={Relational networks often evolve over time by the addition,
deletion, and changing of links, nodes, and attributes. However, accurately
incorporating the full range of temporal dependencies into
relational learning algorithms remains a challenge. We propose a novel
framework for discovering temporal-relational representations for classification.
The framework considers transformations over all the evolving
relational components (attributes, edges, and nodes) in order to accurately
incorporate temporal dependencies into relational models. Additionally,
we propose temporal ensemble methods and demonstrate their
effectiveness against traditional and relational ensembles on two real-world
datasets. In all cases, the proposed temporal-relational models
outperform competing models that ignore temporal information.},
keywords={node classification, time-evolving relational classification, ensemble methods, temporal dependencies, dynamic network classification, classifier, representation learning}
}
@inproceedings{rossi2010modeling,
title={Modeling the Evolution of Discussion Topics and Communication to Improve Relational Classification},
author={Ryan A. Rossi and Jennifer Neville},
booktitle={SIGKDD SOMA},
pages={89--97},
year={2010},
pdf={pubs/somaKDD2010.pdf},
slides={talks/temporal-relational-classifiers.pdf},
poster={talks/tenc-poster.pdf},
pubtype={workshop},
abstract={Textual analysis is one means by which to assess communication type and moderate the influence of network structure in predictive models of individual behavior. However, there are few methods available to incorporate textual content into time-evolving network models. In particular, modeling both the evolution of network topology and textual content change in time-varying communication data poses a difficult challenge. In this work, we propose a Temporally Evolving Network Classifier (TENC) to incorporate the influence of time-varying edges and temporally-evolving attributes in relational classification models. To facilitate this, we use an evolutionary latent topic approach to automatically discover and label communications between individuals in a network with their corresponding latent topic. The topics of the messages are incorporated into the TENC along with time-varying relationships and temporally-evolving attributes, using weighted, exponential kernel summarization. We evaluate the utility of the TENC on a real-world classification task, where the aim is to predict the effectiveness of a developer in the python open-source developer network. We take advantage of the textual content in developer emails and bug communications, which both evolve over time. The TENC paired with the latent topics significantly improves performance over the baseline classifiers that only take into account the static properties of the topics and communications. The results show that the TENC can be used to accurately model the complete-set of temporal dynamics in time-evolving communication networks.},
}
@inproceedings{rossi2007crick,
title={{Crick's} Hypothesis Revisited: The Existence of a Universal Coding Frame},
author={Jean-Louis Lassez and Ryan A. Rossi and Axel E. Bernal},
booktitle={AINAW},
volume={1},
pages={745--751},
year={2007},
location={Niagara Falls, Ontario, Canada},
note={Presented in the US, Russia, Japan, Thailand and Canada at various conferences and keynotes.},
pdf={pubs/AINA-BLSC2007_Universal_Coding_Frame.pdf},
abstract={In 1957 Crick hypothesized that the genetic code was a comma free code. This property would imply the existence of a universal coding frame and make the set of coding sequences a locally testable language. As the link between nucleotides and amino acids became better understood, it appeared clearly that the genetic code was not comma free. Crick then adopted a radically different hypothesis: the "frozen accident". However, the notions of comma free codes and locally testable languages are now playing a role in DNA Computing, while circular codes have been found as subsets of the genetic code. We revisit Crick's 1957 hypothesis in that context. We show that coding sequences from a wide variety of genes from the three domains, eukaryotes, prokaryotes and archaea, have a property of testable by fragments, which is an adaptation of the notion of local testability to DNA sequences. These results support the existence of a universal coding frame, as the frame of a coding sequence can be determined from one of its fragments, independently from the gene or the organism the coding sequence comes from.},
slides={slides/Existence_of_a_Universal_Coding_Frame_Khon_Kaen.pdf},
external={https://dx.doi.org/10.1109/AINAW.2007.138},
pubtype={conference},
}
@article{lassez2008ranking,
title={Ranking Links on the Web: Search and Surf Engines},
author={Jean-Louis Lassez and Ryan A. Rossi and Kumar Jeev},
journal={New Frontiers in Applied Artificial Intelligence (IEA/AIE)},
pages={199--208},
year={2008},
publisher={Springer},
abstract={The main algorithms at the heart of search engines have focused on ranking and classifying sites. This is appropriate when we know what we are looking for and want it directly. Alternatively, we surf, in which case ranking and classifying links becomes the focus. We address this problem using a latent semantic analysis of the web. This technique allows us to rate, suppress or create links giving us a version of the web suitable for surfing. Furthermore, we show on benchmark examples that the performance of search algorithms such as PageRank is substantially improved as they work on an appropriately weighted graph.},
pdf={pubs/AIEPoland2008_Ranking_Links.pdf},
external={https://dx.doi.org/10.1007/978-3-540-69052-8_21},
slides={talks/empty_proof_japan.pdf},
location={Wroc{\l}aw, Poland},
pubtype={conference},
keywords={Link estimation, latent modeling, link weighting, latent links, surf engines, link prediction, Singular Value Decomposition, latent semantic analysis, search algorithms}
}
@inproceedings{lassez2008signature,
title={Signature based Intrusion Detection using Latent Semantic Analysis},
author={Jean-Louis Lassez and Ryan A. Rossi and Stephen Sheel and Srinivas Mukkamala},
booktitle={Proceedings of the IEEE International Joint Conference on Neural Networks (IJCNN)},
pages={1068--1074},
year={2008},
location={Hong Kong},
otherinfo={IEEE World Congress on Computational Intelligence},
pdf={pubs/WCCI2008_LSA_ID.pdf},
external={https://dx.doi.org/10.1109/IJCNN.2008.4633931},
abstract={We address the problem of selecting and extracting key features by using singular value decomposition and latent semantic analysis. As a consequence, we are able to discover latent information which allows us to design signatures for forensics and in a dual approach for real-time intrusion detection systems. The validity of this method is shown by using several automated classification algorithms (Maxim, SVM, LGP). Using the original data set we classify 99.86% of the calls correctly. After feature extraction we classify 99.68% of the calls correctly, while with feature selection we classify 99.78% of the calls correctly, justifying the use of these techniques in forensics. The signatures obtained after feature selection and extraction using LSA allow us to classify 95.69% of the calls correctly with features that can be computed in real time. We use Support Vector Decision Function and Linear Genetic Programming for feature selection on a real data set generated on a live performance network that consists of probe and denial of service attacks. We find that the results reinforce our feature selection method.},
keywords={singular value decomposition, support vector machines, automated classification algorithms, feature selection, latent semantic analysis, linear genetic programming, real-time intrusion detection systems, signature based intrusion detection, cyber security, support vector decision function, classification algorithms, data mining, feature extraction, genetic programming, Intrusion detection},
pubtype={conference},
}
@inproceedings{stamey2007dynamic,
title={Client-side Dynamic Metadata in Web 2.0},
author={John Stamey and Jean-Louis Lassez and Daniel Boorn and Ryan A. Rossi},
booktitle={Proceedings of the 25th annual ACM International Conference on Design of Communication (SIGDOC)},
pages={155--161},
year={2007},
pdf={pubs/SIGDOC2007_Dynamic_Metadata.pdf},
external={http://dl.acm.org/citation.cfm?id=1297176},
pubtype={conference},
}
@article{rossi2009latent,
title={Latent Semantic Analysis of the Languages of Life},
author={Ryan A. Rossi},
journal={Computational Intelligence and Intelligent Systems},
pages={128--137},
year={2009},
publisher={Springer},
pdf={pubs/rossi-lsa-language-of-life.pdf},
external={https://dx.doi.org/10.1007/978-3-642-04962-0_15},
abstract={We use Latent Semantic Analysis as a basis to study the languages of life. Using this approach we derive techniques to discover latent relationships between organisms such as significant motifs and evolutionary features. Doubly Singular Value Decomposition is defined and the significance of this adaptation is demonstrated by finding a phylogeny of twenty prokaryotes. Minimal Killer Words are used to define families of organisms from negative information. The application of these words makes it possible to automatically retrieve the coding frame of a sequence from any organism.},
pubtype={conference},
}
@inproceedings{powell2010scalable,
title={A Scalable Image Processing Framework for Gigapixel Mars and Other Celestial Body Images},
author={Mark W. Powell and Ryan A. Rossi and Khawaja S. Shams},
booktitle={IEEE Aerospace},
pages={1--11},
year={2010},
pdf={pubs/SIPF-Mars-JPL-IEEE.pdf},
external={https://dx.doi.org/10.1109/AERO.2010.5446706},
abstract={The Mars Reconnaissance Orbiter's HiRISE (High Resolution Imaging Science Experiment) camera takes the largest images of the Martian surface. The image size is typically around 2.52 gigapixels. There is only a handful of software capable of doing a task as simple as reducing the size of the image by half and saving the result as a new image. The Scalable Image Processing Framework (SIPF) overcomes these issues by creating a generalized tile-based processing pipeline that loads only a small portion of the image into memory. This allows for the data in memory at any given time to become manageable. Image tiles are an intrinsic property that provides scalability and efficiency while processing images. Distributed computing technologies such as cloud computing can be applied naturally. A mathematical framework for scalable image operations is defined that provides insight into the scalable considerations needed with each class of operations. We also formalize the deferred execution design pattern and show how it is used as a basis for our implementation. The SIPF has the ability to perform a variety of scalable image operations such as cropping, rotation, scaling (bilinear and nearest neighbor interpolation), edge detection, sharpening, convolution (filters), brightness, contrast, and Gaussian blurring. The Scalable Image Processing Framework will be used to process incoming images from the Mars Exploration Rovers and eventually the Mars Science Laboratory. It will be integrated with the Maestro software (science visualization and planning tool). Maestro is used for the Mars Exploration Rover Mission and other celestial body exploratory missions.},
location={Big Sky, Montana},
keywords={Mars, astronomical image processing, planetary remote sensing, planetary surfaces, HiRISE camera, High Resolution Imaging Science Experiment, Mars reconnaissance orbiter, Martian surface, SIPF, cloud computing, computing technologies, gigapixel images, mathematical framework, planetary images, scalable image operations, scalable image processing framework, tile-based processing pipeline, Cameras, Distributed computing, High-resolution imaging, Image processing, Mars, Memory management, Pipelines, Reconnaissance, Scalability, Tiles},
pubtype={conference},
}
@inproceedings{shams2010polyphony,
title={Polyphony: A Workflow Orchestration Framework for Cloud Computing},
author={Khawaja S. Shams and Mark W. Powell and Tom M. Crockett and Jeffrey S. Norris and Ryan A. Rossi and Tom Soderstrom},
booktitle={10th IEEE/ACM International Conference on Cluster, Cloud and Grid Computing (CCGrid)},
pages={606--611},
year={2010},
note={
Amazon AWS Case Study: NASA JPL's Desert Research and Training},
pdf={pubs/Polyphony%20A%20Workflow%20Orchestration%20Framework%20for%20Cloud.pdf},
external={http://dl.acm.org/citation.cfm?id=1845207},
abstract={Cloud Computing has delivered unprecedented compute capacity to NASA missions at affordable rates. Missions like the Mars Exploration Rovers (MER) and Mars Science Lab (MSL) are enjoying the elasticity that enables them to leverage hundreds, if not thousands, or machines for short durations without making any hardware procurements. In this paper, we describe Polyphony, a resilient, scalable, and modular framework that efficiently leverages a large set of computing resources to perform parallel computations. Polyphony can employ resources on the cloud, excess capacity on local machines, as well as spare resources on the supercomputing center, and it enables these resources to work in concert to accomplish a common goal. Polyphony is resilient to node failures, even if they occur in the middle of a transaction. We will conclude with an evaluation of a production-ready application built on top of Polyphony to perform image-processing operations of images from around the solar system, including Mars, Saturn, and Titan.},
pubtype={conference},
}
@inproceedings{stamey2009automatically,
title={Automatically Identifying Relations in Privacy Policies},
author={John W. Stamey and Ryan A. Rossi},
booktitle={Proceedings of the 27th ACM International Conference on Design of Communication},
pages={233--238},
year={2009},
pdf={pubs/SIGDOC2009-rossi.pdf},
external={http://dl.acm.org/citation.cfm?id=1622041},
pubtype={conference},
}
@inproceedings{rossi2013modeling-evol,
title={Modeling the Evolution of the Internet Topology: A Multi-Level Evaluation Framework},
author={Ryan A. Rossi and Sonia Fahmy and Nilothpal Talukder},
booktitle={Tech. Report Purdue CS},
pages={1--10},
year={2013},
pubtype={tech-report},
pdf={pubs/modeling_internetAS_evolution.pdf},
abstract={The topology of a network (connectivity of autonomous
systems (ASes) or routers) has significant implications
on the design of protocols and applications, and on the placement
of services and data centers. Researchers and practitioners alike
are in constant need of realistic topologies for their simulation,
emulation, and testbed experiments. In this work, we propose
a multi-level framework for analyzing Internet topologies and
their evolution. This multi-level approach includes novel metrics,
evaluation strategies, and techniques for automatically learning
a representative set of graph metrics. We apply our framework
to analyze topologies from two recent topology generators, Orbis
and WIT, according to how well they match their advertised
objectives. The generated topologies are compared to a set of
benchmark datasets that approximate different views of the
Internet at the data (trace-route measurements), control (BGP),
and management (WHOIS) planes. Our results demonstrate key
limitations of popular generators, and show that the recent
Internet clustering coefficient and average distance are not time-invariant
as assumed by many models. Additionally, we propose
a taxonomy of topology generators, and identify challenges in
topology modeling.},
keywords={topology, Internet AS, evolution, temporal networks, network evolution model}
}
@article{rossi2013parallel-cliques,
title={A Fast Parallel Maximum Clique Algorithm for Large Sparse Graphs and Temporal Strong Components},
author={Ryan A. Rossi and David F. Gleich and Assefaw H. Gebremedhin and Mostofa A. Patwary},
journal={arXiv preprint arXiv:1302.6256},
pages={1--9},
year={2013},
pdf={pubs/parallel-maximum-clique-algorithm-for-large-sparse-graphs.pdf},
pubtype={tech-report},
abstract={We propose a fast, parallel, maximum clique algorithm for
large, sparse graphs that is designed to exploit characteristics
of social and information networks. We observe roughly
linear runtime scaling over graphs between 1000 vertices and
100M vertices. In a test with a 1.8 billion-edge social network,
the algorithm finds the largest clique in about 20 minutes.
For social networks, in particular, we found that using
the core number of a vertex in combination with a good
heuristic clique finder efficiently removes the vast majority
of the search space. In addition, we parallelize the exploration
of the search tree. In the algorithm, processes immediately
communicate changes to upper and lower bounds on
the size of maximum clique, which occasionally results in a
super-linear speedup because vertices with especially large
search spaces can be pruned by other processes. We use this
clique finder to investigate the size of the largest temporal
strong components in dynamic networks, which requires finding
the largest clique in a particular temporal reachability
graph.},
keywords={maximum clique, max clique, clique, sparse networks, sparse graphs, dense graphs, parallel algorithms, heuristic clique finder, clique finder, PMC, super-linear speedup, temporal strong components}
}
@article{rossi2012fastclique,
title={What if CLIQUE were fast? Maximum Cliques in Information Networks and Strong Components in Temporal Networks},
author={Ryan A. Rossi and David F. Gleich and Assefaw H. Gebremedhin and Mostofa A. Patwary},
journal={arXiv preprint arXiv:1210.5802},
pages={1--11},
year={2012},
pdf={pubs/maxclique_tscc.pdf},
pubtype={tech-report},
abstract={Exact maximum clique finders have progressed to the point
where we can investigate cliques in million-node social and
information networks, as well as find strongly connected
components in temporal networks. We use one such finder
to study a large collection of modern networks emanating
from biological, social, and technological domains. We
show inter-relationships between maximum cliques and
several other common network properties, including network
density, maximum core, and number of triangles. In
temporal networks, we find that the largest temporal strong
components have around 20-30% of the vertices of the entire
network. These components represent groups of highly
communicative individuals. In addition, we discuss and
improve the performance and utility of the maximum clique
finder itself.},
keywords={max-clique, temporal strong components, social networks, information networks}
}
@inproceedings{rossi2011modeling,
title={Modeling Temporal Behavior in Large Networks: A Dynamic Mixed-Membership Model},
author={Ryan A. Rossi and Brian Gallagher and Jennifer Neville and Keith Henderson},
booktitle={DOE Scientific and Technical Information, LLNL-TR-514271},
year={2011},
pages={1--10},
pdf={pubs/DBMM.pdf},
external={http://www.osti.gov/scitech/biblio/1035597},
publisher={DOE},
abstract={Given a large time-evolving network, how can we model and characterize the temporal behaviors of individual nodes (and network states)? How can we model the behavioral transition patterns of nodes? We propose a temporal behavior model that captures the 'roles' of nodes in the graph and how they evolve over time. The proposed dynamic behavioral mixed-membership model (DBMM) is scalable, fully automatic (no user-defined parameters), non-parametric/data-driven (no specific functional form or parameterization), interpretable (identifies explainable patterns), and flexible (applicable to dynamic and streaming networks). Moreover, the interpretable behavioral roles are generalizable, computationally efficient, and natively supports attributes. We applied our model for (a) identifying patterns and trends of nodes and network states based on the temporal behavior, (b) predicting future structural changes, and (c) detecting unusual temporal behavior transitions. We use eight large real-world datasets from different time-evolving settings (dynamic and streaming). In particular, we model the evolving mixed-memberships and the corresponding behavioral transitions of Twitter, Facebook, IP-Traces, Email (University), Internet AS, Enron, Reality, and IMDB. The experiments demonstrate the scalability, flexibility, and effectiveness of our model for identifying interesting patterns, detecting unusual structural transitions, and predicting the future structural changes of the network and individual nodes.},
pubtype={tech-report},
}
@inproceedings{rossi2009discovering,
title={Discovering Latent Graphs with Positive and Negative Links to Eliminate Spam},
author={Ryan A. Rossi},
booktitle={JPL Tech. Report},
year={2009},
pages={1--9},
pubtype={tech-report},
pdf={pubs/LatentGraphTR-JPL-NASA.pdf},
external={http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.172.1204},
abstract={This paper proposes a new direction in Adversarial Information Retrieval through automatically ranking links. We use techniques based on Latent Semantic Analysis to define a novel algorithm to eliminate spam sites. Our model automatically creates, suppresses, and reinforces links. Using an appropriately weighted graph spam links are assigned substantially lower weights while links to normal sites are created and reinforced. The empirical validity of these techniques to eliminate and drastically decrease the impact of spam is shown by both our Local Sapling Heuristic and a machine learning algorithm used to classify sites with features derived from our Latent Graph.},
keywords={latent graph, adversarial information retrieval, negative link, reinforces link, new direction, weighted graph spam link, normal site, empirical validity, latent semantic analysis, ranking link, local sapling heuristic, spam site},
active={true},
}
@phdthesis{rossi2015purdue,
title={Improving Relational Machine Learning by Modeling Temporal Dependencies},
author={Ryan A. Rossi},
note={Ph.D. Dissertation, Purdue University},
school={Purdue University},
year={2015},
pages={163},
keywords={Machine Learning, Relational Learning, Classification, Dynamic Networks, Supervised Learning, Statistical Models, Statistical Relational Learning, Network Analysis, Parallel Algorithms, Dynamic Network Analysis, Relational Time Series Learning},
abstract={Networks encode dependencies between entities (people, computers, proteins) and allow us to study phenomena across social, technological, and biological domains. These networks naturally evolve over time by the addition, deletion, and changing of links, nodes, and attributes. Existing work in Relational Machine Learning (RML) has ignored relational time series data consisting of dynamic graphs and attributes, even despite the importance of modeling these dynamics. This dissertation investigates the problem of Relational Time-series Learning from dynamic attributed graph data, with the goal of improving the predictive quality of existing RML methods. In particular, we propose a framework for learning dynamic graph representations, as well as methods for the three representation discovery tasks of (i) dynamic node labeling, (ii) weighting, and (iii) prediction. In addition, techniques for modeling relational and temporal dependencies are proposed, along with efficient methods for discovering features, ensembles, as well as classification methods. The results demonstrate the importance of modeling both the relational and temporal dependencies as well as learning an appropriate graph data representation that captures these fundamental patterns. Furthermore, while previous work has focused on static graphs that are small, non-attributed, simple, or homogeneous, we instead have carefully designed generalized relational time-series models that are: (a) efficient with linear or nearly linear runtime, (b) scalable for big graph data, (c) flexible for a variety of data types and characteristics, and (d) capable of modeling attributed and heterogeneous relational time-series data. Finally, the proposed methods are shown to be scalable, effective, and flexible for a variety of real-world applications.},
publisher={ProQuest},
pdf={pubs/rossi-phd-dissertation-purdue15.pdf},
external={http://docs.lib.purdue.edu/dissertations/AAI3734519/},
pubtype={phd-dissertation},
active={true},
}
@article{rossi2015purdue-article,
title={Improving Relational Machine Learning by Modeling Temporal Dependencies},
author={Ryan A. Rossi},
journal={Ph.D. Dissertation, Purdue University},
school={Purdue University},
year={2015},
pages={163},
keywords={Machine Learning, Relational Learning, Classification, Dynamic Networks, Supervised Learning, Statistical Models, Statistical Relational Learning, Network Analysis, Parallel Algorithms, Dynamic Network Analysis, Relational Time Series Learning},
abstract={Networks encode dependencies between entities (people, computers, proteins) and allow us to study phenomena across social, technological, and biological domains. These networks naturally evolve over time by the addition, deletion, and changing of links, nodes, and attributes. Existing work in Relational Machine Learning (RML) has ignored relational time series data consisting of dynamic graphs and attributes, even despite the importance of modeling these dynamics. This dissertation investigates the problem of Relational Time-series Learning from dynamic attributed graph data, with the goal of improving the predictive quality of existing RML methods. In particular, we propose a framework for learning dynamic graph representations, as well as methods for the three representation discovery tasks of (i) dynamic node labeling, (ii) weighting, and (iii) prediction. In addition, techniques for modeling relational and temporal dependencies are proposed, along with efficient methods for discovering features, ensembles, as well as classification methods. The results demonstrate the importance of modeling both the relational and temporal dependencies as well as learning an appropriate graph data representation that captures these fundamental patterns. Furthermore, while previous work has focused on static graphs that are small, non-attributed, simple, or homogeneous, we instead have carefully designed generalized relational time-series models that are: (a) efficient with linear or nearly linear runtime, (b) scalable for big graph data, (c) flexible for a variety of data types and characteristics, and (d) capable of modeling attributed and heterogeneous relational time-series data. Finally, the proposed methods are shown to be scalable, effective, and flexible for a variety of real-world applications.},
publisher={ProQuest},
pdf={pubs/rossi-phd-dissertation-purdue15.pdf},
external={http://docs.lib.purdue.edu/dissertations/AAI3734519/},
pubtype={phd-dissertation},
active={false},
}
@misc{rossi18oneClassSim,
title={One-class Similarity Machines for Anomaly Detection},
author={Ryan A. Rossi and Ajay Raghavan and Jungho Park},
year={2017},
yearfiled={2018},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed},
pubtype={patent},
abstract={One-class similarity machines is a family of unsupervised novelty/anomaly detection methods. We have used it for detecting anomalies in multi-variate time series data. Performance is comparable and sometimes better than one-class SVM. The method also has many advantages over one-class SVM, e.g., the target function is approximated locally and the invention also naturally handles changes in the problem domain. The method is a lazy-learning approach and therefore doesn't require training a model, yet can operate in a streaming fashion, which makes it exceptionally good for anomaly detection in time series data where the problem domain and underlying characteristics may be continuously evolving over time.},
keywords={one-class classification, novelty detection, similarity machines, unsupervised learning, time series, multivariate time series, anomaly detection, outlier detection, identifying unusual patterns, anomalies, outliers, machine learning, similarity-based learning, kernel function learning},
active={true},
}
@misc{rossi18hone-patent,
title={Higher-Order Network Embedding},
author={Ryan A. Rossi and Sungchul Kim and Eunyee Koh},
year={2018},
yearfiled={2018},
datefiled={11/29/2018},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #16/204,616},
pubtype={patent},
abstract={In implementations of higher-order network embedding, a computing device
maintains interconnected data in the form of a graph that represents a network, the graph
including nodes that each represent entities in the network and node associations that
each represent edges between the nodes in the graph. The computing device includes
a network embedding module that is implemented to determine a frequency of k-vertex
motifs for each of the edges in the graph, and derive motif-based matrices from the
frequency of each of the k-vertex motifs in the graph. The network embedding module
is also implemented to determine a higher-order network embedding for each of the
nodes in the graph from each of the motif-based matrices. The network embedding
module can then concatenate the higher-order network embeddings into a matrix
representation.},
keywords={Higher-order network embeddings, network motifs, graph representation learning, network representation learning},
active={true},
}
@misc{latent-network-summ-patent,
author={Di Jin and Ryan A. Rossi and Eunyee Koh and Sungchul Kim and Anup Rao},
title={Latent Network Summarization},
year={2018},
yearfiled={2019},
datefiled={1/18/2019},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #16/252,169},
pubtype={patent},
abstract={An important reason behind the prevalence of node representation learning is their superiority in downstream machine learning tasks on graphs. However, storing the vector-based node representation of massive real-world graphs often requires space that is orders of magnitude larger. To alleviate this issue, we introduce the problem of latent network summarization that is complementary to the problem of network embedding and propose a general framework called Multi-LENS. Instead of deriving dense node-wise representation, the goal of latent network summarization is to summarize the structural properties of the graph in a latent space with dimensionality that is independent of the nodes or edges in the input graph. The size-independent graph summary given by Multi-LENS consists of (i) a set of relational aggregators with their compositions (relational functions) that captures structural features of multi-order node-centric subgraphs, and (ii) the low-rank approximations to the induced matrices incorporating graph heterogeneity. In addition, Multi-LENS is able to derive node embeddings on the fly from this latent summarization due to its inductive properties. Multi-LENS bridges advantages brought by both embeddings and graph summarization, and applies to graphs with or without directionality, weights, attributes or labels. Extensive experiments on both synthetic and real-world graphs show that Multi-LENS achieves 2-89\% improvement in AUC for link prediction, while requiring less than 79x space compared to existing representation learning approaches. We also show the effectiveness of Multi-LENS summaries in detecting anomalies and events in the Enron email communication graph and Twitter co-mention graph.},
keywords={graph summarization, graph compression, heterogeneous networks, structural similarity, latent network summarization},
active={true},
}
@misc{rossi18time-dependent-network-embedding,
title={Time-Dependent Network Embedding},
author={Ryan A. Rossi and Sungchul Kim and Eunyee Koh},
year={2018},
yearfiled={2018},
datefiled={11/15/2018},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #16/192,313},
pubtype={patent},
abstract={In implementations of time-dependent network embedding, a computing device
maintains time-dependent interconnected data in the form of a time-based graph that
includes nodes and node associations that each represent an edge between two of the
nodes in the time-based graph based at least in part on a temporal value that indicates
when the two nodes were associated. The computing device includes a network
embedding module that is implemented to traverse one or more of the nodes in the time-based
graph along the node associations, where the traversal is performed with respect
to the temporal value of each of the edges that associate the nodes. The network
embedding module is also implemented to determine a time-dependent embedding for
each of the nodes traversed in the time-based graph, the time-dependent embedding for
each of the respective nodes being representative of feature values that describe the
respective node.},
keywords={Time-Dependent Network Embedding, Dynamic Networks, Temporal Graphs, Dynamic Network Embedding, Dynamic Network Representation Learning},
active={true},
}
@misc{park18JointAnomaly,
title={System and Method for Anomaly Characterization Based on Joint Historical and Time-series Analysis},
author={Jungho Park and Ajay Raghavan and Ryan A. Rossi and Yosuke Tajika and Akira Minegishi and Tetsuyoshi Ogura},
year={2017},
yearfiled={2018},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #16/170,815},
pubtype={patent},
abstract={One embodiment provides a system for facilitating anomaly detection and
characterization. During operation, the system determines, by a computing
device, a first set of testing data which includes a plurality of data points, wherein
the first set includes a data series for a first variable and one or more second
variables. The system identifies anomalies by dividing the first set into a number
of groups and performing an inter-quartile range analysis on data in each
respective group. The system obtains, from the first set, a second set of testing
data which includes a data series from a recent time period occurring before a
current time, and which further includes a first data point from the identified
anomalies. The system classifies the first data point as a first type of anomaly
based on whether a magnitude of a derivative of the second set is greater than a
first predetermined threshold.},
keywords={anomaly characterization, joint historical analysis, time-series analysis, time series, multivariate time series, anomaly detection, outlier detection, identifying unusual patterns, anomalies, outliers, sensor data},
active={true},
}
@misc{ajay18binnedIQR,
title={Binned IQR for Anomaly Detection in Multivariate Time Series},
author={Ajay Raghavan and Ryan A. Rossi and Jungho Park},
year={2017},
yearfiled={2018},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed},
pubtype={patent},
abstract={This invention presents a new method to detect anomalies in multivariate time series data using ``binned'' IQR (Inter-Quartile Range). The IQR method is commonly used to detect anomalies in one-dimensional data. However, the IQR method does not perform well if the data instances have different behaviors according to other factors or dependent variables. The proposed binned IQR method divides the data into separate bins to overcome the above problem. Furthermore, this also avoids issues when anomalies may be missed if the time series or other variables represent sums. Thus, the proposed approach allows automated learning and detection of anomalies in multi-dimensional data for various applications such as identifying suspicious activity, reducing inefficiencies and improving productivity.},
keywords={time series, multivariate time series, IQR, anomaly detection, outlier detection, identifying unusual patterns, anomalies, outliers},
active={true},
}
@misc{rossi17multilabel,
title={Method and System for Similarity-based Multi-label Learning},
author={Ryan A. Rossi and Hoda Eldardiry},
year={2017},
yearfiled={2017},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #16/237,439},
abstract={A system is provided for facilitating multi-label classification. During operation, the system maintains a set of training vectors. A respective vector represents an object and is associated with one or more labels that belong to a label set. After receiving an input vector, the system determines a similarity value between the input vector and one or more training vectors. The system further determines one or more labels associated with the input vector based on the similarity values between the input vector and the training vectors and their corresponding associated labels.},
pubtype={patent},
active={true},
}
@misc{lee17graphAttention,
title={System And Method For Graph Attention},
author={John Boaz Lee and Ryan A. Rossi},
year={2017},
yearfiled={2017},
booktitle={Patent},
howpublished={Patent},
note={Patent pending},
pubtype={patent},
active={false},
}
@misc{rossi17attrWalks,
title={Attributed Random Walks: a Generalization Of Deep Graph Learning And Embedding Algorithms},
author={Ryan A. Rossi and Rong Zhou},
year={2017},
yearfiled={2016},
booktitle={Patent},
howpublished={Patent},
note={Patent pending},
pubtype={patent},
active={false},
}
@misc{rossi17deepRML,
title={Deep Graph Representation Learning},
author={Ryan A. Rossi and Rong Zhou},
year={2019},
yearfiled={2016},
booktitle={Patent},
howpublished={Patent},
note={US Patent No. 10482375},
pubtype={patent},
active={true},
}
@misc{rossi17graphSearchEngine,
title={A Graph Search Engine},
author={Ryan A. Rossi and Rong Zhou},
year={2017},
yearfiled={2016},
booktitle={Patent},
howpublished={Patent},
note={Patent pending},
pubtype={patent},
active={true},
}
@misc{rossi16deep-relational-learning,
title={Deep Relational Learning},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2015},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed},
pubtype={patent},
active={true},
}
@misc{rossi16patent-graphlet-estimation,
title={Fast and Accurate Unbiased Graphlet Estimation},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2015},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #15/179724},
pubtype={patent},
}
@misc{rossi15patent-hybrid-cpu-gpu-graphlets,
title={Efficient Parallel Hybrid CPU-GPU Graph Mining and Learning via Induced Subgraph Features},
random={},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2016},
booktitle={Patent},
howpublished={Patent},
pubtype={patent},
note={Patent application filed},
}
@misc{rossi15patent-clique-compression,
title={System And Method For Compressing Graphs Via Cliques},
random={Foreign File EP, JP, KR: A System and Method for Compressing Graphs via Cliques to Speedup Graph Algorithms and Reduce Storage Requirements},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2015},
booktitle={Patent},
howpublished={Patent},
pubtype={patent},
note={Patent application filed, USPTO App. #15/183561},
}
@misc{rossi16patent-localized-visual,
title={Localized Visual Graph Filters for Complex Graph Queries},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2015},
pubtype={patent},
booktitle={Patent},
howpublished={Patent},
note={Patent application filed, USPTO App. #15/175751},
random={Foreign File},
}
@misc{rossi16patent-rel-time-series,
title={Computer-implemented System And Method For Relational Time Series Learning},
random={Relational Time Series Prediction using Maximum Similarity},
author={Ryan A. Rossi and Rong Zhou},
year={2016},
yearfiled={2014},
pubtype={patent},
booktitle={Patent},
howpublished={Patent},
note={US Patent No. 10438130, USPTO App. #14/955965},
}
@misc{rossi16patent-pcmf,
title={Parallel Collective Matrix Factorization Framework for Big Data},
author={Ryan A. Rossi and Rong Zhou},
year={2015},
yearfiled={2014},
booktitle={Patent},
pubtype={patent},
note={US Patent No. 10235403},
random={United States Patent Application 20160012088, filing date: Jul 8, 2014, publication date: Jan 14, 2016},
abstract={A system and a method perform matrix factorization. According to the system and the method, at least one matrix is received. The at least one matrix is to be factorized into a plurality of lower-dimension matrices defining a latent feature model. After receipt of the at least one matrix, the latent feature model is updated to approximate the at least one matrix. The latent feature model includes a plurality of latent features. Further, the update performed by cycling through the plurality of latent features at least once and alternatingly updating the plurality of lower-dimension matrices during each cycle.},
external={https://www.google.com/patents/US20160012088},
pdf={pubs/US20160012088A1.pdf},
}
@inproceedings{higher-order-clustering-heter-arxiv,
title={Higher-Order Clustering for Heterogeneous Networks via Typed Motifs},
author={Aldo G. Carranza and Ryan A. Rossi and Anup Rao and Eunyee Koh},
booktitle={arXiv:1810.02959},
pages={15},
year={2018},
pdf={pubs/Higher-Order-Motif-based-Clustering-for-Heterogeneous-Networks.pdf},
pubtype={},
abstract={Higher-order connectivity patterns such as small induced sub-graphs called graphlets (network motifs) are vital to understand the important components (modules/functional units) governing the configuration and behavior of complex networks. Existing work in higher-order clustering has focused on simple homogeneous graphs with a single node/edge type. However, heterogeneous graphs consisting of nodes and edges of different types are seemingly ubiquitous in the real-world. In this work, we introduce the notion of typed-graphlet that explicitly captures the rich (typed) connectivity patterns in heterogeneous networks. Using typed-graphlets as a basis, we develop a general principled framework for higher-order clustering in heterogeneous networks. The framework provides mathematical guarantees on the optimality of the higher-order clustering obtained. The experiments demonstrate the effectiveness of the framework quantitatively for three important applications including (i) clustering, (ii) link prediction, and (iii) graph compression. In particular, the approach achieves a mean improvement of 43x over all methods and graphs for clustering while achieving an 18.7\% and 20.8\% improvement for link prediction and graph compression, respectively.},
keywords={Clustering, higher-order clustering, spectral clustering, typed graphlets, network motifs, community detection, heterogeneous graphs, labeled graphs, graph mining},
active={true},
}
@book{introBINF,
author = {Jean-Louis Lassez and Ryan A. Rossi and Stephen Sheel},
title = {Introduction to Bioinformatics Using Action Labs},
journal = {Book ISBN 1329925912},
publisher = {},
pdf={http://ryanrossi.com/books/INTRO_BINF_BOOK.pdf},
year = {2009},
isbn = {1329925912},
pubtype = {book},
external = {http://www.amazon.com/Introduction-Bioinformatics-Using-Action-Labs/dp/1329925912},
abstract = {Bioinformatics is the application of computational techniques and tools to analyze and manage biological data. This book provides an introduction to bioinformatics through the use of Action Labs. These labs allow students to get experience using real data and tools to solve difficult problems. The book comes with supplementary software tools and papers. The labs use data from Breast Cancer, Liver Disease, Diabetes, SARS, HIV, Extinct Organisms, and many others. The book has been written for first or second year computer science, mathematics, and biology students.},
active={true},
}