@inproceedings{bib_MoRA_2025,
  author    = {Kalakonda Sai Shashank and Maheshwari Shubh Jagmohan and Ravi Kiran Sarvadevabhatla},
  title     = {{MoRAG} - Multi-Fusion Retrieval Augmented Generation for Human Motion},
  booktitle = {Winter Conference on Applications of Computer Vision},
  year      = {2025},
}
We introduce MoRAG, a novel multi-part fusion based retrieval-augmented generation strategy for text-based human motion generation. The method enhances motion diffusion models by leveraging additional knowledge obtained through an improved motion retrieval process. By effectively prompting large language models (LLMs), we address spelling errors and rephrasing issues in motion retrieval. Our approach utilizes a multi-part retrieval strategy to improve the generalizability of motion retrieval across the language space. We create diverse samples through the spatial composition of the retrieved motions. Furthermore, by utilizing low-level, part-specific motion information, we can construct motion samples for unseen text descriptions. Our experiments demonstrate that our framework can serve as a plug-and-play module, improving the performance of motion diffusion models. Code, pretrained models and sample videos will be made available at: https://motionrag.github.io/
CrackUDA: Incremental Unsupervised Domain Adaptation for Improved Crack Segmentation in Civil Structures
Kushagra Srivastava,Damodar Datta,Rizvi Tahereen,Pradeep Kumar Ramancharla,Ravi Kiran Sarvadevabhatla,Harikumar Kandath
@inproceedings{bib_Crac_2024,
  author    = {Kushagra Srivastava and Damodar Datta and Rizvi Tahereen and Pradeep Kumar Ramancharla and Ravi Kiran Sarvadevabhatla and Harikumar Kandath},
  title     = {{CrackUDA}: Incremental Unsupervised Domain Adaptation for Improved Crack Segmentation in Civil Structures},
  booktitle = {International Conference on Pattern Recognition},
  year      = {2024},
}
Crack segmentation plays a crucial role in ensuring the structural integrity and seismic safety of civil structures. However, existing crack segmentation algorithms encounter challenges in maintaining accuracy with domain shifts across datasets. To address this issue, we propose a novel deep network that employs incremental training with unsupervised domain adaptation (UDA) using adversarial learning, without a significant drop in accuracy in the source domain. Our approach leverages an encoder-decoder architecture, consisting of both domain-invariant and domain-specific parameters. The encoder learns shared crack features across all domains, ensuring robustness to domain variations. Simultaneously, the decoder's domain-specific parameters capture domain-specific features unique to each domain. By combining these components, our model achieves improved crack segmentation performance. Furthermore, we introduce BuildCrack, a new crack dataset comparable to sub-datasets of the well-established CrackSeg9K dataset in terms of image count and crack percentage. We evaluate our proposed approach against state-of-the-art UDA methods using different sub-datasets of CrackSeg9K and our custom dataset. Our experimental results demonstrate a significant improvement in crack segmentation accuracy and generalization across target domains compared to other UDA methods - specifically, an improvement of 0.65 and 2.7 mIoU on source and target domains respectively. Code, models, and dataset will be made available.
OLAF: A Plug-and-Play Framework for
Enhanced Multi-object Multi-part Scene Parsing
@inproceedings{bib_OLAF_2024,
  author    = {Pranav Gupta and Rishubh Singh and Pradeep Shenoy and Ravi Kiran Sarvadevabhatla},
  title     = {{OLAF}: A Plug-and-Play Framework for Enhanced Multi-object Multi-part Scene Parsing},
  booktitle = {European Conference on Computer Vision},
  year      = {2024},
}
Enhancing Road Safety: Predictive Modeling of Accident-Prone Zones with ADAS-Equipped Vehicle Fleet Data
Ravi Shankar Mishra,Dev Singh Thakur,Anbumani Subramanian,Mukti Advani,S. Velmurugan,Juby Jose,Jawahar C V,Ravi Kiran Sarvadevabhatla
@inproceedings{bib_Enha_2024,
  author    = {Ravi Shankar Mishra and Dev Singh Thakur and Anbumani Subramanian and Mukti Advani and {S. Velmurugan} and Juby Jose and {Jawahar C V} and Ravi Kiran Sarvadevabhatla},
  title     = {Enhancing Road Safety: Predictive Modeling of Accident-Prone Zones with {ADAS}-Equipped Vehicle Fleet Data},
  booktitle = {Intelligent Vehicles Symposium},
  year      = {2024},
}
This work presents a novel approach to identifying
possible early accident-prone zones in a large city-scale road network using geo-tagged collision alert data from a vehicle fleet. The alert data has been collected for a year from 200 city buses installed with the Advanced Driver Assistance System (ADAS). To the best of our knowledge, no research paper has used ADAS alerts to identify the early accident prone zones. A nonparametric technique called Kernel Density
Estimation (KDE) is employed to model the distribution of alert data across stratified time intervals. A novel recall-based measure is introduced to assess the degree of support provided by our density-based approach for existing, manually determined accident-prone zones (‘blackspots’) provided by civic
authorities. This shows that our KDE approach significantly outperforms existing approaches in terms of the recall-based measure. We also introduce a novel linear assignment Earth Mover Distance-based measure to predict previously unidentified accident-prone zones. The results and findings support the feasibility of utilizing alert data from vehicle fleets to aid civic
planners in assessing accident-zone trends and deploying traffic calming measures, thereby improving overall road safety and saving lives.
IDD-X: A Multi-View Dataset for Ego-relative Important Object Localization and Explanation in Dense and Unstructured Traffic
@inproceedings{bib_IDD-_2024,
  author    = {Chirag Parikh and Rohit Saluja and {Jawahar C V} and Ravi Kiran Sarvadevabhatla},
  title     = {{IDD-X}: A Multi-View Dataset for Ego-relative Important Object Localization and Explanation in Dense and Unstructured Traffic},
  booktitle = {International Conference on Robotics and Automation},
  year      = {2024},
}
Intelligent vehicle systems require a deep understanding of the interplay between road conditions, surrounding entities, and the ego vehicle's driving behavior for safe and efficient navigation. This is particularly critical in developing countries where traffic situations are often dense and unstructured with heterogeneous road occupants. Existing datasets, predominantly geared towards structured and sparse traffic scenarios, fall short of capturing the complexity of driving in such environments. To fill this gap, we present IDD-X, a large-scale dual-view driving video dataset. With 697K bounding boxes, 9K important object tracks, and 1-12 objects per video, IDD-X offers comprehensive ego-relative annotations for multiple important road objects covering 10 categories and 19 explanation label categories. The dataset also incorporates rearview information to provide a more complete representation of the driving environment. We also introduce custom-designed deep networks aimed at multiple important object localization and per-object explanation prediction. Overall, our dataset and introduced prediction models form the foundation for studying how road conditions and surrounding entities affect driving behavior in complex traffic situations.
MAdVerse: A Hierarchical Dataset of Multi-Lingual Ads from Diverse Sources and Categories
Keralapura Nagaraju Amruth Sagar,Rishabh Srivastava,Rakshitha R T,Venkata Kesav Venna,Ravi Kiran Sarvadevabhatla
@inproceedings{bib_MAdV_2024,
  author    = {Keralapura Nagaraju Amruth Sagar and Rishabh Srivastava and {Rakshitha R T} and Venkata Kesav Venna and Ravi Kiran Sarvadevabhatla},
  title     = {{MAdVerse}: A Hierarchical Dataset of Multi-Lingual Ads from Diverse Sources and Categories},
  booktitle = {IEEE Workshop on Applications of Computer Vision},
  year      = {2024},
}
The convergence of computer vision and advertising has sparked substantial interest lately. Existing advertisement datasets are either subsets of existing datasets with specialized annotations or feature diverse annotations without a cohesive taxonomy among ad images. Notably, no datasets encompass diverse advertisement styles or semantic grouping at various levels of granularity. Our work addresses this gap by introducing MAdVerse, an extensive, multilingual compilation of more than 50,000 ads from the web, social media websites, and e-newspapers. Advertisements are hierarchically grouped with uniform granularity into 11 categories, divided into 51 sub-categories, and 524 fine-grained brands at leaf level, each featuring ads in various languages. We provide comprehensive baseline classification results for prediction tasks within the realm of advertising analysis. These tasks include hierarchical ad classification, source classification, multilingual classification, and inducing hierarchy in existing ad datasets.
@inproceedings{bib_Seam_2023,
  author    = {{Niharika} and Rahul Krishna and Ravi Kiran Sarvadevabhatla},
  title     = {{SeamFormer}: High Precision Text Line Segmentation for Handwritten Documents},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2023},
}
Historical manuscripts often contain dense unstructured text lines. The large diversity in sizes, scripts and appearance makes precise text line segmentation extremely challenging. Existing line segmentation approaches often associate diacritic elements incorrectly to text lines and also address above mentioned challenges inadequately. To tackle these issues, we introduce SeamFormer, a novel approach for high precision text line segmentation in handwritten manuscripts. In the first stage of our approach, a multi-task Transformer deep network outputs coarse line identifiers which we term ‘scribbles’ and the binarized manuscript image. In the second stage, a scribble-conditioned seam generation procedure utilizes outputs from first stage and feature maps derived from manuscript image to generate tight-fitting line segmentation polygons. In the process, we incorporate a novel diacritic feature map which enables improved diacritic and text line associations. Via experiments and evaluations on new and existing challenging palm leaf manuscript datasets, we show that SeamFormer outperforms competing approaches and generates precise text line segmentations.
Kushagra Srivastava,Dhruv Patel,Aditya Kumar Jha,Mohhit Kumar Jha,Santosh Ravi Kiran,Pradeep Kumar Ramancharla,Harikumar K,K Madhava Krishna
@inproceedings{bib_UAV-_2023,
  author    = {Kushagra Srivastava and Dhruv Patel and Aditya Kumar Jha and Mohhit Kumar Jha and Santosh Ravi Kiran and Pradeep Kumar Ramancharla and {Harikumar K} and {K Madhava Krishna}},
  title     = {{UAV}-Based Visual Remote Sensing for Automated Building Inspection},
  booktitle = {European Conference on Computer Vision Workshops},
  year      = {2023},
}
Unmanned Aerial Vehicle (UAV) based remote sensing system incorporated with computer vision has demonstrated potential for assisting building construction and in disaster management like damage assessment during earthquakes. The vulnerability of a building to earthquake can be assessed through inspection that takes into account the expected damage progression of the associated component and the component’s contribution to structural system performance. Most of these inspections are done manually, leading to high utilization of manpower, time, and cost. This paper proposes a methodology to automate these inspections through UAV-based image data collection and a software library for post-processing that helps in estimating the seismic structural parameters. The key parameters considered here are the distances between adjacent buildings, building plan-shape, building
“Draw Fast, Guess Slow”: Characterizing Interactions in Cooperative Partially Observable Settings with Online Pictionary as a Case Study
@inproceedings{bib_DrawFast_2023,
  author        = {{Kiruthika K} and Anandhini Rajendran and {Vinoo A R} and Santosh Ravi Kiran},
  title         = {{``Draw Fast, Guess Slow''}: Characterizing Interactions in Cooperative Partially Observable Settings with Online {Pictionary} as a Case Study},
  booktitle     = {IFIP Conference on Human-Computer Interaction},
  year          = {2023},
  internal-note = {Citation key renamed: the original key contained non-ASCII curly-quote characters, which classic BibTeX cannot handle. Update any \cite references accordingly.},
}
Cooperative human-human communication becomes challenging when restrictions such as difference in communication modality and limited time are imposed. We use the popular cooperative social game Pictionary as an online multimodal test bed to explore the dynamics of human-human interactions in such settings. As a part of our study, we identify attributes of player interactions that characterize cooperative gameplay. We found stable and role-specific playing style components that are independent of game difficulty. In terms of gameplay and the larger context of cooperative partially observable communication, our results suggest that too much interaction or unbalanced interaction negatively impacts game success. Additionally
A Cloud-Fog Architecture for Video Analytics on Large Scale Camera Networks Using Semantic Scene Analysis
Kunal Jain,Adapa Kishan Sairam,Kunwar Shaanjeet Singh Grover,Santosh Ravi Kiran,Venkata Suresh Reddy Purini
@inproceedings{bib_A_Cl_2023,
  author    = {Kunal Jain and Adapa Kishan Sairam and Kunwar Shaanjeet Singh Grover and Santosh Ravi Kiran and Venkata Suresh Reddy Purini},
  title     = {A Cloud-Fog Architecture for Video Analytics on Large Scale Camera Networks Using Semantic Scene Analysis},
  booktitle = {International Symposium in Cluster, Cloud, and Grid Computing},
  year      = {2023},
}
This paper proposes a scalable distributed video analytics framework that can process thousands of video streams from sources such as CCTV cameras using semantic scene analysis. The main idea is to deploy deep learning pipelines on the fog nodes and generate semantic scene description records (SDRs) of video feeds from the associated CCTV cameras. These SDRs are transmitted to the cloud instead of video frames saving on network bandwidth. Using these SDRs stored on the cloud database, we can answer many complex queries and perform rich video analytics, within extremely low latencies. There is no need to scan and process the video streams again on a per query basis. The software architecture on the fog nodes allows for integrating new deep learning pipelines dynamically into the existing system, thereby supporting novel analytics and queries. We demonstrate the effectiveness of the system by proposing a novel distributed algorithm for real-time vehicle pursuit. The proposed algorithm involves asking multiple spatio-temporal queries in an adaptive fashion to reduce the query processing time and is robust to inaccuracies in the deployed deep learning pipelines and camera failures.
F3: fair and federated face attribute classification with heterogeneous data
Kanaparthy S V Samhita,P Manisha,Sankarshan Damle,Santosh Ravi Kiran,Sujit P Gujar
Pacific-Asia Conference on Knowledge Discovery and Data Mining, PAKDD, 2023
@inproceedings{bib_F3:__2023,
  author    = {{Kanaparthy S V Samhita} and {P Manisha} and Sankarshan Damle and Santosh Ravi Kiran and Sujit P Gujar},
  title     = {{F3}: Fair and Federated Face Attribute Classification with Heterogeneous Data},
  booktitle = {Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  year      = {2023},
}
Fairness across different demographic groups is an essential criterion for face-related tasks, Face Attribute Classification (FAC) being a prominent example. Apart from this trend, Federated Learning (FL) is increasingly gaining traction as a scalable paradigm for distributed training. Existing FL approaches require data homogeneity to ensure fairness. However, this assumption is too restrictive in real-world settings. We propose F3, a novel FL framework for fair FAC under data heterogeneity. F3 adopts multiple heuristics to improve fairness across different demographic groups without requiring data homogeneity assumption. We demonstrate the efficacy of F3 by reporting empirically observed fairness measures and accuracy guarantees on popular face datasets. Our results suggest that F3 strikes a practical balance between accuracy and fairness for FAC.
DSAG: A Scalable Deep Framework for Action-Conditioned Multi-Actor Full Body Motion Synthesis
Debtanu Gupta,Shubh Maheshwari,Kalakonda Sai Shashank,Manasvi Vaidyula,Santosh Ravi Kiran
Winter Conference on Applications of Computer Vision, WACV, 2023
@inproceedings{bib_DSAG_2023,
  author    = {Debtanu Gupta and Shubh Maheshwari and Kalakonda Sai Shashank and Manasvi Vaidyula and Santosh Ravi Kiran},
  title     = {{DSAG}: A Scalable Deep Framework for Action-Conditioned Multi-Actor Full Body Motion Synthesis},
  booktitle = {Winter Conference on Applications of Computer Vision},
  year      = {2023},
}
We introduce DSAG, a controllable deep neural framework for action-conditioned generation of full body multiactor variable duration actions. To compensate for incompletely detailed finger joints in existing large-scale datasets, we introduce full body dataset variants with detailed finger joints. To overcome shortcomings in existing generative approaches, we introduce dedicated representations for encoding finger joints. We also introduce novel spatiotemporal transformation blocks with multi-head self attention and specialized temporal processing. The design choices enable generations for a large range in body joint counts (24 - 52), frame rates (13 - 50), global body movement (inplace, locomotion) and action categories (12 - 120), across multiple datasets (NTU-120, HumanAct12, UESTC, Human3.6M). Our experimental results demonstrate DSAG’s significant improvements over state-of-the-art, its suitability for action-conditioned generation at scale.
Action-GPT: Leveraging Large-scale Language Models for Improved and Generalized Zero Shot Action Generation
Kalakonda Sai Shashank,Shubh Maheshwari,Santosh Ravi Kiran
International Conference on Multimedia and Expo, ICME, 2023
@inproceedings{bib_Acti_2023,
  author    = {Kalakonda Sai Shashank and Shubh Maheshwari and Santosh Ravi Kiran},
  title     = {{Action-GPT}: Leveraging Large-scale Language Models for Improved and Generalized Zero Shot Action Generation},
  booktitle = {International Conference on Multimedia and Expo},
  year      = {2023},
}
We introduce Action-GPT, a plug and play framework for incorporating Large Language Models (LLMs) into textbased action generation models. Action phrases in current motion capture datasets contain minimal and to-the-point information. By carefully crafting prompts for LLMs, we generate richer and fine-grained descriptions of the action. We show that utilizing these detailed descriptions instead of the original action phrases leads to better alignment of text and motion spaces. Our experiments show qualitative and quantitative improvement in the quality of synthesized motions produced by recent text-to-motion models. Code, pretrained models and sample videos will be made avail
PSUMNet: Unified Modality Part Streams are All You Need for Efficient Pose-based Action Recognition
Trivedi Neel Jayesh,Santosh Ravi Kiran
European Conference on Computer Vision Workshops, ECCV-W, 2022
@inproceedings{bib_PSUM_2022,
  author    = {Trivedi Neel Jayesh and Santosh Ravi Kiran},
  title     = {{PSUMNet}: Unified Modality Part Streams are All You Need for Efficient Pose-based Action Recognition},
  booktitle = {European Conference on Computer Vision Workshops},
  year      = {2022},
}
Pose-based action recognition is predominantly tackled by approaches which treat the input skeleton in a monolithic fashion, i.e. joints in the pose tree are processed as a whole. However, such approaches ignore the fact that action categories are often characterized by localized action dynamics involving only small subsets of part joint groups involving hands (e.g. `Thumbs up') or legs (e.g. `Kicking'). Although part-grouping based approaches exist, each part group is not considered within the global pose frame, causing such methods to fall short. Further, conventional approaches employ independent modality streams (e.g. joint, bone, joint velocity, bone velocity) and train their network multiple times on these streams, which massively increases the number of training parameters. To address these issues, we introduce PSUMNet, a novel approach for scalable and efficient pose-based action recognition. At the representation level, we propose a global frame based part stream approach as opposed to conventional modality based streams. Within each part stream, the associated data from multiple modalities is unified and consumed by the processing pipeline. Experimentally, PSUMNet achieves state of the art performance on the widely used NTURGB+D 60/120 dataset and dense joint skeleton dataset NTU 60-X/120-X. PSUMNet is highly efficient and outperforms competing methods which use 100%-400% more parameters. PSUMNet also generalizes to the SHREC hand gesture dataset with competitive performance. Overall, PSUMNet's scalability, performance and efficiency makes it an attractive choice for action recognition and for …
UAV-based Visual Remote Sensing for Automated Building Inspection
Kushagra Srivastava,Aditya Kumar Jha,Mohhit Kumar Jha,Jaskirat Singh,Santosh Ravi Kiran,Pradeep Kumar Ramancharla,Harikumar K,K Madhava Krishna
European Conference on Computer Vision Workshops, ECCV-W, 2022
@inproceedings{bib_UAV-_2022,
  author    = {Kushagra Srivastava and Aditya Kumar Jha and Mohhit Kumar Jha and Jaskirat Singh and Santosh Ravi Kiran and Pradeep Kumar Ramancharla and {Harikumar K} and {K Madhava Krishna}},
  title     = {{UAV}-based Visual Remote Sensing for Automated Building Inspection},
  booktitle = {European Conference on Computer Vision Workshops},
  year      = {2022},
}
Unmanned Aerial Vehicle (UAV) based remote sensing system incorporated with computer vision has demonstrated potential for assisting building construction and in disaster management like damage assessment during earthquakes. The vulnerability of a building to earthquake can be assessed through inspection that takes into account the expected damage progression of the associated component and the component’s contribution to structural system performance. Most of these inspections are done manually, leading to high utilization of manpower, time, and cost. This paper proposes a methodology to automate these inspections through UAV-based image data collection and a software library for post-processing that helps in estimating the seismic structural parameters. The key parameters considered here are the distances between adjacent buildings, building plan-shape, building plan area, objects on the rooftop and rooftop layout. The accuracy of the proposed methodology in estimating the above-mentioned parameters is verified through field measurements taken using a distance measuring sensor and also from the data obtained through Google Earth. Additional details and code can be accessed from h
DrawMon: A Distributed System for Detection of Atypical Sketch Content in Concurrent Pictionary Games
NIKHIL BANSAL,KARTIK GUPTA,Kiruthika K,Pentapati Sivani,Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2022
@inproceedings{bib_Draw_2022,
  author    = {Nikhil Bansal and Kartik Gupta and {Kiruthika K} and Pentapati Sivani and Santosh Ravi Kiran},
  title     = {{DrawMon}: A Distributed System for Detection of Atypical Sketch Content in Concurrent {Pictionary} Games},
  booktitle = {ACM International Conference on Multimedia},
  year      = {2022},
}
Pictionary, the popular sketch-based guessing game, provides an opportunity to analyze shared goal cooperative game play in restricted communication settings. However, some players occasionally draw atypical sketch content. While such content is occasionally relevant in the game context, it sometimes represents a rule violation and
A Fine-Grained Vehicle Detection (FGVD) Dataset for Unconstrained Roads
Prafful Kumar Khoba,Chirag Parikh,Rohit Saluja,Santosh Ravi Kiran,Jawahar C V
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2022
@inproceedings{bib_A_Fi_2022,
  author    = {Prafful Kumar Khoba and Chirag Parikh and Rohit Saluja and Santosh Ravi Kiran and {Jawahar C V}},
  title     = {A Fine-Grained Vehicle Detection ({FGVD}) Dataset for Unconstrained Roads},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2022},
}
The previous fine-grained datasets mainly focus on classification and are often captured in a controlled setup, with the camera focusing on the objects. We introduce the first Fine-Grained Vehicle Detection (FGVD) dataset in the wild, captured from a moving camera mounted on a car. It contains 5502 scene images with 210 unique fine-grained labels of multiple vehicle types organized in a three-level hierarchy. While previous classification datasets also include makes for different kinds of cars, the FGVD dataset introduces new class labels for categorizing two-wheelers, autorickshaws, and trucks. The FGVD dataset is challenging as it has vehicles in complex traffic scenarios with intra-class and inter-class variations in types, scale, pose, occlusion, and lighting conditions. The current object detectors like yolov5 and faster RCNN perform poorly on our dataset due to a lack of hierarchical modeling. Along with providing baseline results for existing object detectors on FGVD Dataset, we also present the results of a combination of an existing detector and the recent Hierarchical Residual Network (HRN) classifier for the FGVD task. Finally, we show that FGVD vehicle images are the most challenging to classify among the fine-grained datasets.
Counting in the 2020s: Binned Representations and Inclusive Performance Measures for Deep Crowd Counting Approaches
S Sravya Vardhani,Ashwin Gopinath,Ayush Gupta,Ganesh Ramakrishnan,Santosh Ravi Kiran
JOURNAL OF LATEX CLASS FILES, JLCF, 2022
@inproceedings{bib_Coun_2022,
  author        = {{S Sravya Vardhani} and Ashwin Gopinath and Ayush Gupta and Ganesh Ramakrishnan and Santosh Ravi Kiran},
  title         = {Counting in the 2020s: Binned Representations and Inclusive Performance Measures for Deep Crowd Counting Approaches},
  booktitle     = {JOURNAL OF LATEX CLASS FILES},
  year          = {2022},
  internal-note = {Duplicate author entry (Santosh Ravi Kiran listed twice) removed. The booktitle looks like the IEEE LaTeX template placeholder, not a real venue -- verify the actual publication venue.},
}
The data distribution in popular crowd counting datasets is typically heavy tailed and discontinuous. This skew affects all stages within the pipelines of deep crowd counting approaches. Specifically, the approaches exhibit unacceptably large standard deviation wrt statistical measures (MSE, MAE). To address such concerns in a holistic manner, we make two fundamental contributions. Firstly, we modify the training pipeline to accommodate the knowledge of dataset skew. To enable principled and balanced minibatch sampling, we propose a novel smoothed Bayesian binning approach. More specifically, we propose a novel cost function which can be readily incorporated into existing crowd counting deep networks to encourage binaware optimization. As the second contribution, we introduce additional performance measures which are more inclusive and throw light on various comparative performance aspects of the deep networks. We also show that our binning-based modifications retain their superiority wrt the newly proposed performance measures. Overall, our contributions enable a practically useful and detail-oriented characterization of performance for crowd counting approaches.
FLOAT: Factorized Learning of Object Attributes for Improved Multi-object Multi-part Scene Parsing
Rishubh Singh,Pranav Gupta,Pradeep Shenoy,Santosh Ravi Kiran
Computer Vision and Pattern Recognition, CVPR, 2022
@inproceedings{bib_FLOA_2022,
  author    = {Rishubh Singh and Pranav Gupta and Pradeep Shenoy and Santosh Ravi Kiran},
  title     = {{FLOAT}: Factorized Learning of Object Attributes for Improved Multi-object Multi-part Scene Parsing},
  booktitle = {Computer Vision and Pattern Recognition},
  year      = {2022},
}
Multi-object multi-part scene parsing is a challenging task which requires detecting multiple object classes in a scene and segmenting the semantic parts within each object. In this paper, we propose FLOAT, a factorized label space framework for scalable multi-object multi-part parsing. Our framework involves independent dense prediction of object category and part attributes which increases scalability and reduces task complexity compared to the monolithic label space counterpart. In addition, we propose an inference-time ‘zoom’ refinement technique which significantly improves segmentation quality, especially for smaller objects/parts. Compared to state of the art, FLOAT obtains an absolute improvement of 2.0% for mean IOU (mIOU) and 4.8% for segmentation quality IOU (sqIOU) on the Pascal-Part-58 dataset. For the larger Pascal-Part-108 dataset, the improvements are 2.1% for mIOU and 3.9% for
Detecting, Tracking and Counting Motorcycle Rider Traffic Violations on Unconstrained Roads
Aman Goyal,Dev Agarwa,Anbumani Subramanian,Jawahar C V,Santosh Ravi Kiran,Rohit Saluja
Technical Report, arXiv, 2022
@inproceedings{bib_Dete_2022,
  author        = {Aman Goyal and Dev Agarwa and Anbumani Subramanian and {Jawahar C V} and Santosh Ravi Kiran and Rohit Saluja},
  title         = {Detecting, Tracking and Counting Motorcycle Rider Traffic Violations on Unconstrained Roads},
  booktitle     = {Technical Report},
  year          = {2022},
  internal-note = {Appears to be an arXiv technical report, not conference proceedings -- consider @misc with eprint/eprinttype fields once the arXiv identifier is confirmed. Also verify author name "Dev Agarwa" (possibly "Agarwal").},
}
In many Asian countries with unconstrained road traffic conditions, driving violations such as not wearing helmets and triple-riding are a significant source of fatalities involving motorcycles. Identifying and penalizing such riders is vital in curbing road accidents and improving citizens’ safety. With this motivation, we propose an approach for detecting, tracking, and counting motorcycle riding violations in videos taken from a vehicle-mounted dashboard camera. We employ a curriculum learning-based object detector to better tackle challenging scenarios such as occlusions. We introduce a novel trapezium-shaped object boundary representation to increase robustness and tackle the rider-motorcycle association. We also introduce an amodal regressor that generates bounding boxes for the occluded riders. Experimental results on a large-scale unconstrained driving dataset demonstrate the superiority of our approach compared to existing approaches and other ablative variants.
MUGL: Large Scale Multi Person Conditional Action Generation with Locomotion
Shubh Maheshwari,Debtanu Gupta,Santosh Ravi Kiran
Winter Conference on Applications of Computer Vision, WACV, 2022
@inproceedings{bib_MUGL_2022,
  author    = {Shubh Maheshwari and Debtanu Gupta and Santosh Ravi Kiran},
  title     = {{MUGL}: Large Scale Multi Person Conditional Action Generation with Locomotion},
  booktitle = {Winter Conference on Applications of Computer Vision},
  year      = {2022},
}
We introduce MUGL, a novel deep neural model for large-scale, diverse generation of single and multi-person pose-based action sequences with locomotion. Our con- trollable approach enables variable-length generations cus- tomizable by action category, across more than 100 cate- gories. To enable intra/inter-category diversity, we model the latent generative space using a Conditional Gaussian Mixture Variational Autoencoder. To enable realistic gen- eration of actions involving locomotion, we decouple local pose and global trajectory components of the action se- quence. We incorporate duration-aware feature represen- tations to enable variable-length sequence generation. We use a hybrid pose sequence representation with 3D pose sequences sourced from videos and 3D Kinect-based se- quences of NTU-RGBD-120. To enable principled com- parison of generation quality, we employ suitably modi- fied strong baselines during evaluation. Although smaller and simpler compared to baselines, MUGL provides better quality generations, paving the way for practical and con- trollable large-scale human action generation.
Palmira: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts
S. P. Sharan,Aitha Sowmya,Amandeep Kumar,Abhishek Trivedi,Aaron Saju Augustine,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
Abs | | bib Tex
@inproceedings{bib_Palm_2021,
  author    = {S. P. Sharan and Aitha Sowmya and Amandeep Kumar and Abhishek Trivedi and Aaron Saju Augustine and Santosh Ravi Kiran},
  title     = {{Palmira}: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Handwritten documents are often characterized by dense and uneven layout. Despite advances, standard deep network based approaches for semantic layout segmentation are not robust to complex deformations seen across semantic regions. This phenomenon is especially pronounced for the low-resource Indic palm-leaf manuscript domain. To address the issue, we first introduce Indiscapes2, a new large-scale diverse dataset of Indic manuscripts with semantic layout annotations. Indiscapes2 contains documents from four different historical collections and is larger than its predecessor, Indiscapes. We also propose a novel deep network Palmira for robust, deformation-aware instance segmentation of regions in handwritten manuscripts. We also report Hausdorff distance and its variants as a boundary-aware performance measure. Our experiments demonstrate that Palmira provides robust layouts, outperforms strong baseline approaches and ablative variants. We also include qualitative results on Arabic, South-East Asian and Hebrew historical manuscripts to showcase the generalization capability of Palmira.
MediTables: A New Dataset and Deep Network for Multi-category Table Localization in Medical Documents
Akshay Praveen Deshpande,Vaishnav Rao Potlapalli,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition Workshops, ICDAR-W, 2021
Abs | BibTeX
@inproceedings{bib_Medi_2021,
  author    = {Akshay Praveen Deshpande and Vaishnav Rao Potlapalli and Santosh Ravi Kiran},
  title     = {{MediTables}: A New Dataset and Deep Network for Multi-category Table Localization in Medical Documents},
  booktitle = {International Conference on Document Analysis and Recognition Workshops},
  year      = {2021},
}
Localizing structured layout components such as tables is an important task in document image analysis. Numerous layout datasets with document images from various domains exist. However, healthcare and medical documents represent a crucial domain that has not been included so far. To address this gap, we contribute MediTables, a new dataset of 200 diverse medical document images with multi-category table annotations. Meditables contains a wide range of medical document images with variety in capture quality, layouts, skew, occlusion and illumination. The dataset images include pathology, diagnostic and hospital-related reports. In addition to document diversity, the dataset includes implicitly structured tables that are typically not present in other datasets. We benchmark state of the art table localization approaches on the MediTables dataset and introduce a custom-designed U-Net which exhibits robust performance while being drastically smaller in size compared to strong baselines. Our annotated dataset and models represent a useful first step towards the development of focused systems for medical document image analytics, a domain that mandates robust systems for reliable information retrieval. The dataset and models can be accessed at
Deformable deep networks for instance segmentation of overlapping multi page handwritten documents
Aitha Sowmya,Sindhu Bollampalli,Santosh Ravi Kiran
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2021
Abs | BibTeX
@inproceedings{bib_Defo_2021,
  author    = {Aitha Sowmya and Sindhu Bollampalli and Santosh Ravi Kiran},
  title     = {Deformable deep networks for instance segmentation of overlapping multi page handwritten documents},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2021},
}
Digitizing via scanning the physical artifact often forms the first primary step in preserving historical handwritten manuscripts. To maximally utilize scanner surface area and minimize manual labor, multiple manuscripts are usually scanned together into a scanned image. Therefore, the first crucial task in manuscript content understanding is to ensure that each of the individual manuscripts within a scanned image can be isolated (segmented) on a per-instance basis. Existing deep network based approaches for manuscript layout understanding implicitly assume a single or two manuscripts per image. Since this assumption may be routinely violated, there is a need for a precursor system which extracts individual
MeronymNet: A Hierarchical Model for Unified and Controllable Multi-Category Object Generation
Janmejay Pratap Singh Baghel,Abhishek Trivedi,Tejas Ravichandran,Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2021
@inproceedings{bib_Mero_2021,
  author    = {Janmejay Pratap Singh Baghel and Abhishek Trivedi and Tejas Ravichandran and Santosh Ravi Kiran},
  title     = {{MeronymNet}: A Hierarchical Model for Unified and Controllable Multi-Category Object Generation},
  booktitle = {ACM international conference on Multimedia},
  year      = {2021},
}
We introduce MeronymNet, a novel hierarchical approach for controllable, part-based generation of multi-category objects using a single unified model. We adopt a guided coarse-to-fine strategy involving semantically conditioned generation of bounding box layouts, pixel-level part layouts and ultimately, the object depictions themselves. We use Graph Convolutional Networks, Deep Recurrent Networks along with custom-designed Conditional Variational Autoencoders to enable flexible, diverse and category-aware generation of 2-D objects in a controlled manner. The performance scores for generated objects reflect MeronymNet’s superior performance compared to multiple strong baselines and ablative variants.
BoundaryNet: An Attentive Deep Network with Fast Marching Distance Maps for Semi-automatic Layout Annotation
Abhishek Trivedi,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
@inproceedings{bib_Boun_2021,
  author    = {Abhishek Trivedi and Santosh Ravi Kiran},
  title     = {{BoundaryNet}: An Attentive Deep Network with Fast Marching Distance Maps for Semi-automatic Layout Annotation},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Precise boundary annotations of image regions can be crucial for downstream applications which rely on region-class semantics. Some document collections contain densely laid out, highly irregular and overlapping multi-class region instances with large range in aspect ratio. Fully automatic boundary estimation approaches tend to be data intensive, cannot handle variable-sized images and produce sub-optimal results for aforementioned images. To address these issues, we propose BoundaryNet, a novel resizing-free approach for high-precision semi-automatic layout annotation. The variable-sized user selected region of interest is first processed by an attention-guided skip network. The network optimization is guided via Fast Marching distance maps to obtain a good quality initial boundary estimate and an associated feature representation. These outputs are processed by a Residual Graph Convolution Network optimized using Hausdorff loss to obtain the final region boundary. Results on a challenging image manuscript dataset demonstrate that BoundaryNet outperforms strong baselines and produces high-quality semantic region boundaries. Qualitatively, our approach generalizes across multiple document image datasets containing different script systems and layouts, all without additional fine-tuning. We integrate BoundaryNet into a document annotation system and show that it provides high annotation throughput compared to manual and fully automatic alternatives. Keywords: document layout analysis · interactive · deep learning
Palmira: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts
S P Sharan,Sowmya Aitha,Amandeep Kumar,Abhishek Trivedi,Aaron Augustine,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
@inproceedings{bib_Palm_2021,
  author    = {S P Sharan and Sowmya Aitha and Amandeep Kumar and Abhishek Trivedi and Aaron Augustine and Santosh Ravi Kiran},
  title     = {{Palmira}: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Handwritten documents are often characterized by dense and uneven layout. Despite advances, standard deep network based approaches for semantic layout segmentation are not robust to complex deformations seen across semantic regions. This phenomenon is especially pronounced for the low-resource Indic palm-leaf manuscript domain. To address the issue, we first introduce Indiscapes2, a new large-scale diverse dataset of Indic manuscripts with semantic layout annotations. Indiscapes2 contains documents from four different historical collections and is 150% larger than its predecessor, Indiscapes. We also propose a novel deep network Palmira for robust, deformation-aware instance segmentation of regions in handwritten manuscripts. We also report Hausdorff distance and its variants as a boundary-aware performance measure. Our experiments demonstrate that Palmira provides robust layouts, outperforms strong baseline approaches and ablative variants. We also include qualitative results on Arabic, South-East Asian and Hebrew historical manuscripts to showcase the generalization capability of Palmira. Keywords: instance segmentation · deformable convolutional network · historical document analysis · document image segmentation · dataset
MeronymNet: A Hierarchical Approach for Unified and Controllable Multi-Category Object Generation
Rishabh Baghel,Abhishek Trivedi,Tejas Ravichandran,Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2021
@inproceedings{bib_Mero_2021,
  author    = {Rishabh Baghel and Abhishek Trivedi and Tejas Ravichandran and Santosh Ravi Kiran},
  title     = {{MeronymNet}: A Hierarchical Approach for Unified and Controllable Multi-Category Object Generation},
  booktitle = {ACM international conference on Multimedia},
  year      = {2021},
}
We introduce MeronymNet, a novel hierarchical approach for controllable, part-based generation of multi-category objects using a single unified model. We adopt a guided coarse-to-fine strategy involving semantically conditioned generation of bounding box layouts, pixel-level part layouts and ultimately, the object depictions themselves. We use Graph Convolutional Networks, Deep Recurrent Networks along with custom-designed Conditional Variational Autoencoders to enable flexible, diverse and category-aware generation of 2-D objects in a controlled manner. The performance scores for generated objects reflect MeronymNet’s superior performance compared to multiple strong baselines and ablative variants.
Deformable Deep Networks for Instance Segmentation of Overlapping Multi Page Handwritten Documents
Aitha Sowmya,B Sindhu,Santosh Ravi Kiran
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2021
@inproceedings{bib_Defo_2021,
  author    = {Aitha Sowmya and B Sindhu and Santosh Ravi Kiran},
  title     = {Deformable Deep Networks for Instance Segmentation of Overlapping Multi Page Handwritten Documents},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2021},
}
Digitizing via scanning the physical artifact often forms the first primary step in preserving historical handwritten manuscripts. To maximally utilize scanner surface area and minimize manual labor, multiple manuscripts are usually scanned together into a scanned image. Therefore, the first crucial task in manuscript content understanding is to ensure that each of the individual manuscripts within a scanned image can be isolated (segmented) on a per-instance basis. Existing deep network based approaches for manuscript layout understanding implicitly assume a single or two manuscripts per image. Since this assumption may be routinely violated, there is a need for a precursor system which extracts individual manuscripts before downstream processing. Another challenge is the highly curved and deformed boundaries of manuscripts, causing them to often overlap with each other. To tackle such challenges, we introduce a new document image dataset called IMMI (Indic Multi Manuscript Images). To improve the efficiency of dataset and aid deep network training, we also propose an approach which generates synthetic images to augment sourced non-synthetic images. We conduct experiments using modified versions of existing document instance segmentation frameworks. The results demonstrate the efficacy of the new frameworks for the task. Overall, our contributions enable robust extraction of individual historical manuscript pages. This in turn, could potentially enable better performance on downstream tasks such as region-level instance segmentation within handwritten manuscripts and optical character recognition
Monocular multi-layer layout estimation for warehouse racks
Meher Shashwat Nigam,Puppala Avinash Prabhu,Anurag Sahu,Tanvi Karandikar,Puru Gupta,N. Sai Shankar,Santosh Ravi Kiran,K Madhava Krishna
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2021
@inproceedings{bib_Mono_2021,
  author    = {Meher Shashwat Nigam and Puppala Avinash Prabhu and Anurag Sahu and Tanvi Karandikar and Puru Gupta and N. Sai Shankar and Santosh Ravi Kiran and K Madhava Krishna},
  title     = {Monocular multi-layer layout estimation for warehouse racks},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2021},
}
Given a monocular color image of a warehouse rack, we aim to predict the bird's-eye view layout for each shelf in the rack, which we term as 'multi-layer' layout prediction. To this end, we present RackLay, a deep neural network for real-time shelf layout estimation from a single image. Unlike previous layout estimation methods which provide a single layout for the dominant ground plane alone, RackLay estimates the top-view and front-view layout for each shelf in the considered rack populated with objects. RackLay's architecture and its variants are versatile and estimate accurate layouts for diverse scenes characterized by varying number of visible shelves in an image, large range in shelf occupancy factor and varied background clutter. Given the extreme paucity of datasets in this space and the difficulty involved in acquiring real data from warehouses, we additionally release a flexible synthetic dataset generation …
Automatic quantification and visualization of street trees
Arpit Bahety,Rohit Saluja,Santosh Ravi Kiran,Anbumani Subramanian
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2021
@inproceedings{bib_Auto_2021,
  author    = {Arpit Bahety and Rohit Saluja and Santosh Ravi Kiran and Anbumani Subramanian},
  title     = {Automatic quantification and visualization of street trees},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2021},
}
Assessing the number of street trees is essential for evaluating urban greenery and can help municipalities employ solutions to identify tree-starved streets. It can also help identify roads with different levels of deforestation and afforestation over time. Yet, there has been little work in the area of street trees quantification. This work first explains a data collection setup carefully designed for counting roadside trees. We then describe a unique annotation procedure aimed at robustly detecting and quantifying trees. We work on a dataset of around 1300 Indian road scenes annotated with over 2500 street trees. We additionally use the five held-out videos covering 25 km of roads for counting trees. We finally propose a street tree detection, counting, and visualization framework using current object detectors and a novel yet simple counting algorithm owing to the thoughtful collection setup. We find that the high-level …
MeronymNet: A Hierarchical Approach for Unified and Controllable Multi-Category Object Generation
Rishabh Baghel,Abhishek Trivedi,Tejas Ravichandran,Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2021
@inproceedings{bib_Mero_2021,
  author    = {Rishabh Baghel and Abhishek Trivedi and Tejas Ravichandran and Santosh Ravi Kiran},
  title     = {{MeronymNet}: A Hierarchical Approach for Unified and Controllable Multi-Category Object Generation},
  booktitle = {ACM international conference on Multimedia},
  year      = {2021},
}
We introduce MeronymNet, a novel hierarchical approach for controllable, part-based generation of multi-category objects using a single unified model. We adopt a guided coarse-to-fine strategy involving semantically conditioned generation of bounding box layouts, pixel-level part layouts and ultimately, the object depictions themselves. We use Graph Convolutional Networks, Deep Recurrent Networks along with custom-designed Conditional Variational Autoencoders to enable flexible, diverse and category-aware generation of 2-D objects in a controlled manner. The performance scores for generated objects reflect MeronymNet’s superior performance compared to multiple strong baselines and ablative variants.
BoundaryNet: an attentive deep network with fast marching distance maps for semi-automatic layout annotation
Abhishek Trivedi,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
@inproceedings{bib_Boun_2021,
  author    = {Abhishek Trivedi and Santosh Ravi Kiran},
  title     = {{BoundaryNet}: an attentive deep network with fast marching distance maps for semi-automatic layout annotation},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Precise boundary annotations of image regions can be crucial for downstream applications which rely on region-class semantics. Some document collections contain densely laid out, highly irregular and overlapping multi-class region instances with large range in aspect ratio. Fully automatic boundary estimation approaches tend to be data intensive, cannot handle variable-sized images and produce sub-optimal results for aforementioned images. To address these issues, we propose BoundaryNet, a novel resizing-free approach for high-precision semi-automatic layout annotation. The variable-sized user selected region of interest is first processed by an attention-guided skip network. The network optimization is guided via Fast Marching distance maps to obtain a good quality initial boundary estimate and an associated feature representation. These outputs are processed by a Residual Graph Convolution Network optimized using Hausdorff loss to obtain the final region boundary. Results on a challenging image manuscript dataset demonstrate that BoundaryNet outperforms strong baselines and produces high-quality semantic region boundaries. Qualitatively, our approach generalizes across multiple document image datasets containing different script systems and layouts, all without additional fine-tuning. We integrate BoundaryNet into a document annotation system and show that it provides high annotation throughput compared to manual and fully automatic alternatives
NTU-X: An Enhanced Large-scale Dataset for Improving Pose-based Recognition of Subtle Human Actions
Trivedi Neel Jayesh,Anirudh Thatipelli,Santosh Ravi Kiran
Indian Conference on Computer Vision, Graphics and Image Processing, ICVGIP, 2021
@inproceedings{bib_NTU-_2021,
  author    = {Trivedi Neel Jayesh and Anirudh Thatipelli and Santosh Ravi Kiran},
  title     = {{NTU-X}: An Enhanced Large-scale Dataset for Improving Pose-based Recognition of Subtle Human Actions},
  booktitle = {Indian Conference on Computer Vision, Graphics and Image Processing},
  year      = {2021},
}
The lack of fine-grained joints (facial joints, hand fingers) is a fundamental performance bottleneck for state of the art skeleton action recognition models. Despite this bottleneck, community’s efforts seem to be invested only in coming up with novel architectures. To specifically address this bottleneck, we introduce two new pose based human action datasets - NTU60-X and NTU120-X. Our datasets extend the largest existing action recognition dataset, NTU-RGBD. In addition to the 25 body joints for each skeleton as in NTU-RGBD, NTU60-X and NTU120-X dataset includes finger and facial joints, enabling a richer skeleton representation. We appropriately modify the state of the art approaches to enable training using the introduced datasets. Our results demonstrate the effectiveness of these NTU-X datasets in overcoming the aforementioned bottleneck and improve state of the art performance, overall and on previously worst performing action categories. Code and pretrained models can be found at https://github.com/skelemoa/ntu-x
MediTables: A New Dataset and Deep Network for Multi-Category Table Localization in Medical Documents
Akshay Praveen Deshpande,Vaishnav Rao Potlapalli,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition Workshops, ICDAR-W, 2021
@inproceedings{bib_Medi_2021,
  author    = {Akshay Praveen Deshpande and Vaishnav Rao Potlapalli and Santosh Ravi Kiran},
  title     = {{MediTables}: A New Dataset and Deep Network for Multi-Category Table Localization in Medical Documents},
  booktitle = {International Conference on Document Analysis and Recognition Workshops},
  year      = {2021},
}
Localizing structured layout components such as tables is an important task in document image analysis. Numerous layout datasets with document images from various domains exist. However, healthcare and medical documents represent a crucial domain that has not been included so far. To address this gap, we contribute MediTables, a new dataset of 200 diverse medical document images with multi-category table annotations. Meditables contains a wide range of medical document images with variety in capture quality, layouts, skew, occlusion and illumination. The dataset images include pathology, diagnostic and hospital-related reports. In addition to document diversity, the dataset includes implicitly structured tables that are typically not present in other datasets. We benchmark state of the art table localization approaches on the MediTables dataset and introduce a custom-designed U-Net which exhibits robust performance while being drastically smaller in size compared to strong baselines. Our annotated dataset and models represent a useful first step towards the development of focused systems for medical document image analytics, a domain that mandates robust systems for reliable information retrieval. The dataset and models can be accessed at https://github.com/atmacvit/meditables
DocVisor: A Multi-purpose Web-based Interactive Visualizer for Document Image Analytics
B V Khadiravana,Pranav Tadimeti,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition Workshops, ICDAR-W, 2021
@inproceedings{bib_DocV_2021,
  author    = {B V Khadiravana and Pranav Tadimeti and Santosh Ravi Kiran},
  title     = {{DocVisor}: A Multi-purpose Web-based Interactive Visualizer for Document Image Analytics},
  booktitle = {International Conference on Document Analysis and Recognition Workshops},
  year      = {2021},
}
The performance for many document-based problems (OCR, Document Layout Segmentation, etc.) is typically studied in terms of a single aggregate performance measure (Intersection-Over-Union, Character Error Rate, etc.). While useful, the aggregation is a trade-off between instance-level analysis of predictions which may shed better light on a particular approach’s biases and performance characteristics. To enable a systematic understanding of instance-level predictions, we introduce DocVisor - a web-based multi-purpose visualization tool for analyzing the data and predictions related to various document image understanding problems. DocVisor provides support for visualizing data sorted using custom-specified performance metrics and display styles. It also supports the visualization of intermediate outputs (e.g., attention maps, coarse predictions) of the processing pipelines. This paper describes the appealing features of DocVisor and showcases its multi-purpose nature and general utility. We illustrate DocVisor’s functionality for four popular document understanding tasks – document region layout segmentation, tabular data detection, weakly-supervised document region segmentation and optical character recognition. DocVisor is available as a documented public repository for use by the community.
Wisdom of (Binned) Crowds: A Bayesian Stratification Paradigm for Crowd Counting
S Sravya Vardhani,Mansi Pradeep Khamkar,Divij Bajaj,Ganesh Ramakrishnan,Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2021
@inproceedings{bib_Wisd_2021,
  author    = {S Sravya Vardhani and Mansi Pradeep Khamkar and Divij Bajaj and Ganesh Ramakrishnan and Santosh Ravi Kiran},
  title     = {Wisdom of (Binned) Crowds: A {Bayesian} Stratification Paradigm for Crowd Counting},
  booktitle = {ACM international conference on Multimedia},
  year      = {2021},
}
Datasets for training crowd counting deep networks are typically heavy-tailed in count distribution and exhibit discontinuities across the count range. As a result, the de facto statistical measures (MSE, MAE) exhibit large variance and tend to be unreliable indicators of performance across the count range. To address these concerns in a holistic manner, we revise processes at various stages of the standard crowd counting pipeline. To enable principled and balanced
BoundaryNet - An Attentive Deep Network with Fast Marching Distance Maps for Semi-automatic Layout Annotation
Abhishek Trivedi,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
@inproceedings{bib_Boun_2021,
  author    = {Abhishek Trivedi and Santosh Ravi Kiran},
  title     = {{BoundaryNet} - An Attentive Deep Network with Fast Marching Distance Maps for Semi-automatic Layout Annotation},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Precise boundary annotations of image regions can be crucial for downstream applications which rely on region-class semantics. Some document collections contain densely laid out, highly irregular and overlapping multi-class region instances with large range in aspect ratio. Fully automatic boundary estimation approaches tend to be data intensive, cannot handle variable-sized images and produce sub-optimal results for aforementioned images. To address these issues, we propose BoundaryNet, a novel resizing-free approach for high-precision semi-automatic layout annotation. The variable-sized user selected region of interest is first processed by an attention-guided skip network. The network optimization is guided via Fast Marching distance maps to obtain a good quality initial boundary estimate and an associated feature representation. These outputs are processed by a Residual Graph Convolution Network optimized using Hausdorff loss to obtain the final region boundary. Results on a challenging image manuscript dataset demonstrate that BoundaryNet outperforms strong baselines and produces high-quality semantic region boundaries. Qualitatively, our approach generalizes across multiple document image datasets containing different script systems and layouts, all without additional fine-tuning. We integrate BoundaryNet into a document annotation system and show that it provides high annotation throughput compared to manual and fully automatic alternatives.
Palmira: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts
S P Sharan,Aitha Sowmya, Amandeep kumar,Abhishek Trivedi,Aaron Saju Augustine,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2021
@inproceedings{bib_Palm_2021,
  author    = {S P Sharan and Aitha Sowmya and Amandeep Kumar and Abhishek Trivedi and Aaron Saju Augustine and Santosh Ravi Kiran},
  title     = {{Palmira}: A Deep Deformable Network for Instance Segmentation of Dense and Uneven Layouts in Handwritten Manuscripts},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2021},
}
Handwritten documents are often characterized by dense and uneven layout. Despite advances, standard deep network based approaches for semantic layout segmentation are not robust to complex deformations seen across semantic regions. This phenomenon is especially pronounced for the low-resource Indic palm-leaf manuscript domain. To address the issue, we first introduce Indiscapes2, a new large-scale diverse dataset of Indic manuscripts with semantic layout annotations. Indiscapes2 contains documents from four different historical collections and is 150% larger than its predecessor, Indiscapes. We also propose a novel deep network Palmira for robust, deformation-aware instance segmentation of regions in handwritten manuscripts. We also report Hausdorff distance and its variants as a boundary-aware performance measure. Our experiments demonstrate that Palmira provides robust layouts, outperforms strong baseline approaches and ablative variants. We also include qualitative results on Arabic, South-East Asian and Hebrew historical manuscripts to showcase the generalization capability of Palmira .
Syntactically Guided Generative Embeddings for Zero-Shot Skeleton Action Recognition
Pranay Gupta,Divyanshu Sharma,Santosh Ravi Kiran
International Conference on Image Processing, ICIP, 2021
@inproceedings{bib_SYNT_2021,
  author    = {Pranay Gupta and Divyanshu Sharma and Santosh Ravi Kiran},
  title     = {Syntactically Guided Generative Embeddings for Zero-Shot Skeleton Action Recognition},
  booktitle = {International Conference on Image Processing},
  year      = {2021},
}
We introduce SynSE, a novel syntactically guided generative approach for Zero-Shot Learning (ZSL). Our end-to-end approach learns progressively refined generative embedding spaces constrained within and across the involved modalities (visual, language). The inter-modal constraints are defined between action sequence embedding and embeddings of Parts of Speech (PoS) tagged words in the corresponding action description. We deploy SynSE for the task of skeleton-based action sequence recognition. Our design choices enable SynSE to generalize compositionally, i.e., recognize sequences whose action descriptions contain words not encountered during training. We also extend our approach to the more challenging Generalized Zero-Shot Learning (GZSL) problem via a confidence-based gating mechanism. We are the first to present zero-shot skeleton action recognition results on the large-scale NTU-60 and NTU-120 skeleton action datasets with multiple splits. Our results demonstrate SynSE’s state of the art performance in both ZSL and GZSL settings compared to strong baselines on the NTU-60 and NTU-120 datasets.
Syntactically Guided Generative Embeddings for Zero-Shot Skeleton Action Recognition
Pranay Gupta,Divyanshu sharma,Santosh Ravi Kiran
Technical Report, arXiv, 2021
@misc{bib_Synt_2021,
  author       = {Pranay Gupta and Divyanshu Sharma and Santosh Ravi Kiran},
  title        = {Syntactically Guided Generative Embeddings for Zero-Shot Skeleton Action Recognition},
  howpublished = {Technical report, arXiv},
  year         = {2021},
}
We introduce SynSE, a novel syntactically guided generative approach for Zero-Shot Learning (ZSL). Our end-to-end approach learns progressively refined generative embedding spaces constrained within and across the involved modalities (visual, language). The inter-modal constraints are defined between action sequence embedding and embeddings of Parts of Speech (PoS) tagged words in the corresponding action description. We deploy SynSE for the task of skeleton-based action sequence recognition. Our design choices enable SynSE to generalize compositionally, i.e., recognize sequences whose action descriptions contain words not encountered during training. We also extend our approach to the more challenging Generalized Zero-Shot Learning (GZSL) problem via a confidence-based gating mechanism. We are the first to present zero-shot skeleton action recognition results on the large-scale NTU-60 and NTU-120 skeleton action datasets with multiple splits. Our results demonstrate SynSE's state of the art performance in both ZSL and GZSL settings compared to strong baselines on the NTU-60 and NTU-120 datasets.
NTU60-X: Towards Skeleton-based Recognition of Subtle Human Actions
Anirudh Thatipelli,Trivedi Neel Jayesh,Santosh Ravi Kiran
Technical Report, arXiv, 2021
@misc{bib_NTU6_2021,
  author       = {Anirudh Thatipelli and Trivedi Neel Jayesh and Santosh Ravi Kiran},
  title        = {{NTU60-X}: Towards Skeleton-based Recognition of Subtle Human Actions},
  howpublished = {Technical report, arXiv},
  year         = {2021},
}
The lack of fine-grained joints such as hand fingers is a fundamental performance bottleneck for state of the art skeleton action recognition models trained on the largest action recognition dataset, NTU-RGBD. To address this bottleneck, we introduce a new skeleton based human action dataset - NTU60-X. In addition to the 25 body joints for each skeleton as in NTU-RGBD, NTU60-X dataset includes finger and facial joints, enabling a richer skeleton representation. We appropriately modify the state of the art approaches to enable training using the introduced dataset. Our results demonstrate the effectiveness of NTU60-X in overcoming the aforementioned bottleneck and improve state of the art performance, overall and on hitherto worst performing action categories.
RackLay: Multi-Layer Layout Estimation for Warehouse Racks
Meher Shashwat Nigam,Puppala Avinash Prabhu,Anurag Sahu,Puru Gupta,Tanvi Karandikar,N. Sai Shankar,Santosh Ravi Kiran,K Madhava Krishna
Technical Report, arXiv, 2021
@misc{bib_Rack_2021,
  author       = {Meher Shashwat Nigam and Puppala Avinash Prabhu and Anurag Sahu and Puru Gupta and Tanvi Karandikar and N. Sai Shankar and Santosh Ravi Kiran and K Madhava Krishna},
  title        = {{RackLay}: Multi-Layer Layout Estimation for Warehouse Racks},
  howpublished = {Technical report, arXiv},
  year         = {2021},
}
Given a monocular color image of a warehouse rack, we aim to predict the bird’s-eye view layout for each shelf in the rack, which we term as ‘multi-layer’ layout prediction. To this end, we present RackLay, a deep neural network for real-time shelf layout estimation from a single image. Unlike previous layout estimation methods which provide a single layout for the dominant ground plane alone, RackLay estimates the top-view and front-view layout for each shelf in the considered rack populated with objects. RackLay’s architecture and its variants are versatile and estimate accurate layouts for diverse scenes characterized by varying number of visible shelves in an image, large range in shelf occupancy factor and varied background clutter. Given the extreme paucity of datasets in this space and the difficulty involved in acquiring real data from warehouses, we additionally release a flexible synthetic dataset generation pipeline WareSynth which allows users to control the generation process and tailor the dataset according to contingent application. The ablations across architectural variants and comparison with strong prior baselines vindicate the efficacy of RackLay as an apt architecture for the novel problem of multi-layered layout estimation. We also show that fusing the top-view and front-view enables 3D reasoning applications such as metric free space estimation for the considered rack.
An OCR for Classical Indic Documents Containing Arbitrarily Long Words
Agam Dwivedi,Rohit Saluja,Santosh Ravi Kiran
Computer Vision and Pattern Recognition Conference workshops, CVPR-W, 2020
@inproceedings{bib_An_O_2020,
  author    = {Agam Dwivedi and Rohit Saluja and Santosh Ravi Kiran},
  title     = {An {OCR} for Classical {Indic} Documents Containing Arbitrarily Long Words},
  booktitle = {Computer Vision and Pattern Recognition Conference Workshops},
  year      = {2020},
}
OCR for printed classical Indic documents written in Sanskrit is a challenging research problem. It involves complexities such as image degradation, lack of datasets and long-length words. Due to these challenges, the word accuracy of available OCR systems, both academic and industrial, is not very high for such documents. To address these shortcomings, we develop a Sanskrit specific OCR system. We present an attention-based LSTM model for reading Sanskrit characters in line images. We introduce a dataset of Sanskrit document images annotated at line level. To augment real data and enable high performance for our OCR, we also generate synthetic data via curated font selection and rendering designed to incorporate crucial glyph substitution rules. Consequently, our OCR achieves a word error rate of 15.97% and a character error rate of 3.71% on challenging Indic document texts and outperforms strong baselines. Overall, our contributions set the stage for application of OCRs on large corpora of classic Sanskrit texts containing arbitrarily long and highly conjoined words.
Early Bird: Loop Closures from Opposing Viewpoints for Perceptually-Aliased Indoor Environments
Satyajit Tourani,Dhagash Desai,Udit Singh Parihar,Sourav Garg,Santosh Ravi Kiran,Michael Milford,K Madhava Krishna
International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applicat, VISIGRAPP, 2020
@inproceedings{bib_Earl_2020,
  author    = {Satyajit Tourani and Dhagash Desai and Udit Singh Parihar and Sourav Garg and Santosh Ravi Kiran and Michael Milford and K Madhava Krishna},
  title     = {Early Bird: Loop Closures from Opposing Viewpoints for Perceptually-Aliased Indoor Environments},
  booktitle = {International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications},
  year      = {2020},
}
Significant advances have been made recently in Visual Place Recognition (VPR), feature correspondence, and localization due to the proliferation of deep-learning-based methods. However, existing approaches tend to address, partially or fully, only one of two key challenges: viewpoint change and perceptual aliasing. In this paper, we present novel research that simultaneously addresses both challenges by combining deep-learned features with geometric transformations based on reasonable domain assumptions about navigation on a ground-plane, whilst also removing the requirement for specialized hardware setup (eg lighting, downwards facing cameras). In particular, our integration of VPR with SLAM by leveraging the robustness of deep-learned features and our homography-based extreme viewpoint invariance significantly boosts the performance of VPR, feature correspondence, and pose graph submodules of the SLAM pipeline. For the first time, we demonstrate a localization system capable of state-of-the-art performance despite perceptual aliasing and extreme 180-degree-rotated viewpoint change in a range of real-world and simulated experiments. Our system is able to achieve early loop closures that prevent significant drifts in SLAM trajectories. We also compare extensively several deep architectures for VPR and descriptor matching. We also show that superior place recognition and descriptor matching across opposite views results in a similar performance gain in back-end pose graph optimization.
Pictionary-Style Word Guessing on Hand-Drawn Object Sketches: Dataset, Analysis and Deep Network Models
Santosh Ravi Kiran,Shiv Surya,Trisha Mittal, R. Venkatesh Babu
IEEE Transaction on Pattern Analysis Machine Intelligence, TPAMI, 2020
@article{bib_Pict_2020,
  author  = {Santosh Ravi Kiran and Shiv Surya and Trisha Mittal and R. Venkatesh Babu},
  title   = {{Pictionary}-Style Word Guessing on Hand-Drawn Object Sketches: Dataset, Analysis and Deep Network Models},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  year    = {2020},
}
The ability of intelligent agents to play games in human-like fashion is popularly considered a benchmark of progress in Artificial Intelligence. In our work, we introduce the first computational model aimed at Pictionary, the popular word-guessing social game. We first introduce Sketch-QA, a guessing task. Styled after Pictionary, Sketch-QA uses incrementally accumulated sketch stroke sequences as visual data. Sketch-QA involves asking a fixed question (“What object is being drawn?”) and gathering open-ended guess-words from human guessers. We analyze the resulting dataset and present many interesting findings therein. To mimic Pictionary-style guessing, we propose a deep neural model which generates guess-words in response to temporally evolving human-drawn object sketches. Our model even makes human-like mistakes while guessing, thus amplifying the human mimicry factor. We evaluate our model on the large-scale guess-word dataset generated via Sketch-QA task and compare with various baselines. We also conduct a Visual Turing Test to obtain human impressions of the guess-words generated by humans and our model. Experimental results demonstrate the promise of our approach for Pictionary and similarly themed games.
Topological Mapping for Manhattan-like Repetitive Environments
Sai Shubodh Puligilla,Satyajit Tourani,Vaidya Tushar Shridhar,Udit Singh Parihar,Santosh Ravi Kiran,K Madhava Krishna
International Conference on Robotics and Automation, ICRA, 2020
@inproceedings{bib_Topo_2020,
  author    = {Sai Shubodh Puligilla and Satyajit Tourani and Vaidya Tushar Shridhar and Udit Singh Parihar and Santosh Ravi Kiran and K Madhava Krishna},
  title     = {Topological Mapping for {Manhattan}-like Repetitive Environments},
  booktitle = {International Conference on Robotics and Automation},
  year      = {2020},
}
We showcase a topological mapping framework for a challenging indoor warehouse setting. At the most abstract level, the warehouse is represented as a Topological Graph where the nodes of the graph represent a particular warehouse topological construct (e.g. rackspace, corridor) and the edges denote the existence of a path between two neighbouring nodes or topologies. At the intermediate level, the map is represented as a Manhattan Graph where the nodes and edges are characterized by Manhattan properties and as a Pose Graph at the lower-most level of detail. The topological constructs are learned via a Deep Convolutional Network while the relational properties between topological instances are learnt via a Siamese-style Neural Network. In the paper, we show that maintaining abstractions such as Topological Graph and Manhattan Graph help in recovering an accurate Pose Graph starting from a highly erroneous and unoptimized Pose Graph. We show how this is achieved by embedding topological and Manhattan relations as well as Manhattan Graph aided loop closure relations as constraints in the backend Pose Graph optimization framework. The recovery of near ground-truth Pose Graph on real-world indoor warehouse …
OPAL-Net: A Generative Model for Part-based Object Layout Generation
Rishabh Chandra,Santosh Ravi Kiran
Technical Report, arXiv, 2020
@misc{bib_OPAL_2020,
  author       = {Rishabh Chandra and Santosh Ravi Kiran},
  title        = {{OPAL-Net}: A Generative Model for Part-based Object Layout Generation},
  howpublished = {Technical report, arXiv},
  year         = {2020},
}
We propose OPAL-Net, a novel hierarchical architecture for part-based layout generation of objects from multiple categories using a single unified model. We adopt a coarse-to-fine strategy involving semantically conditioned autoregressive generation of bounding box layouts and pixel-level part layouts for objects. We use Graph Convolutional Networks, Deep Recurrent Networks along with custom-designed Conditional Variational Autoencoders to enable flexible, diverse and category-aware generation of object layouts. We train OPAL-Net on PASCAL-Parts dataset. The generated samples and corresponding evaluation scores demonstrate the versatility of OPAL-Net compared to ablative variants and baseline
Indiscapes: Instance segmentation networks for layout parsing of historical indic manuscripts
Abhishek Prusty,Aitha Sowmya,Abhishek Trivedi,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition, ICDAR, 2019
@inproceedings{bib_Indi_2019,
  author    = {Abhishek Prusty and Aitha Sowmya and Abhishek Trivedi and Santosh Ravi Kiran},
  title     = {{Indiscapes}: Instance Segmentation Networks for Layout Parsing of Historical {Indic} Manuscripts},
  booktitle = {International Conference on Document Analysis and Recognition},
  year      = {2019},
}
Historical palm-leaf manuscript and early paper documents from Indian subcontinent form an important part of the world's literary and cultural heritage. Despite their importance, large-scale annotated Indic manuscript image datasets do not exist. To address this deficiency, we introduce Indiscapes, the first ever dataset with multi-regional layout annotations for historical Indic manuscripts. To address the challenge of large diversity in scripts and presence of dense, irregular layout elements (e.g. text lines, pictures, multiple documents per image), we adapt a Fully Convolutional Deep Neural Network architecture for fully automatic, instance-level spatial layout parsing of manuscript images. We demonstrate the effectiveness of proposed architecture on images from the Indiscapes dataset. For annotation flexibility and keeping the non-technical nature of domain experts in mind, we also contribute a custom, web-based …
HInDoLA: A Unified Cloud-based Platform for Annotation, Visualization and Machine Learning-based Layout Analysis of Historical Manuscripts
Abhishek Trivedi,Santosh Ravi Kiran
International Conference on Document Analysis and Recognition Workshops, ICDAR-W, 2019
@inproceedings{bib_HInD_2019,
  author    = {Abhishek Trivedi and Santosh Ravi Kiran},
  title     = {{HInDoLA}: A Unified Cloud-based Platform for Annotation, Visualization and Machine Learning-based Layout Analysis of Historical Manuscripts},
  booktitle = {International Conference on Document Analysis and Recognition Workshops},
  year      = {2019},
}
Palm-leaf manuscripts are one of the oldest medium of inscription in many Asian countries. Especially, manuscripts from the Indian subcontinent form an important part of the world's literary and cultural heritage. Despite their significance, large-scale datasets for layout parsing and targeted annotation systems do not exist. Addressing this, we propose a web-based layout annotation and analytics system. Our system, called HInDoLA, features an intuitive annotation GUI, a graphical analytics dashboard and interfaces with machine-learning based intelligent modules on the backend. HInDoLA has successfully helped us create the first ever large-scale dataset for layout parsing of Indic palm-leaf manuscripts. These manuscripts, in turn, have been used to train and deploy deep-learning based modules for fully automatic and semi-automatic instance-level layout parsing.
Operator-in-the-Loop Deep Sequential Multi-Camera Feature Fusion for Person Re-Identification
K. L. Navaneet,Santosh Ravi Kiran,SHASHANK SHEKHAR,R. Venkatesh Babu,Anirban Chakraborty
IEEE Transactions on Information Forensics and Security, TIFS, 2019
@article{bib_Oper_2019,
  author        = {K. L. Navaneet and Santosh Ravi Kiran and Shashank Shekhar and R. Venkatesh Babu and Anirban Chakraborty},
  title         = {Operator-in-the-Loop Deep Sequential Multi-Camera Feature Fusion for Person Re-Identification},
  journal       = {IEEE Transactions on Information Forensics and Security},
  year          = {2019},
  internal-note = {Possible duplicate of bib_Oper_2018 (same title, venue, authors) -- verify and keep one},
}
Given a target image as query, person re-identification systems retrieve a ranked list of candidate matches on a per-camera basis. In deployed systems, a human operator scans these lists and labels sighted targets by touch or mouse-based selection. However, classical re-id approaches generate per-camera lists independently. Therefore, target identifications by operator in a subset of cameras cannot be utilized to improve ranking of the target in remaining set of network cameras. To address this shortcoming, we propose a novel sequential multi-camera re-id approach. The proposed approach can accommodate human operator inputs and provides early gains via a monotonic improvement in target ranking. At the heart of our approach is a fusion function which operates on deep feature representations of query and candidate matches. We formulate an optimization procedure custom-designed to incrementally …
Game of sketches: Deep recurrent models of pictionary-style word guessing
Santosh Ravi Kiran, Shiv Surya,Trisha Mittal, R. Venkatesh Babu
AAAI Conference on Artificial Intelligence, AAAI, 2018
@inproceedings{bib_Game_2018,
  author    = {Santosh Ravi Kiran and Shiv Surya and Trisha Mittal and R. Venkatesh Babu},
  title     = {Game of Sketches: Deep Recurrent Models of {Pictionary}-Style Word Guessing},
  booktitle = {AAAI Conference on Artificial Intelligence},
  year      = {2018},
}
The ability of machine-based agents to play games in human-like fashion is considered a benchmark of progress in AI. In this paper, we introduce the first computational model aimed at Pictionary, the popular word-guessing social game. We first introduce Sketch-QA, an elementary version of Visual Question Answering task. Styled after Pictionary, Sketch-QA uses incrementally accumulated sketch stroke sequences as visual data. Notably, Sketch-QA involves asking a fixed question ("What object is being drawn?") and gathering open-ended guess-words from human guessers. To mimic Pictionary-style guessing, we propose a deep neural model which generates guess-words in response to temporally evolving human-drawn sketches. Our model even makes human-like mistakes while guessing, thus amplifying the human mimicry factor. We evaluate our model on the large-scale guess-word dataset generated via Sketch-QA task and compare with various baselines. We also conduct a Visual Turing Test to obtain human impressions of the guess-words generated by humans and our model. Experimental results demonstrate the promise of our approach for Pictionary and similarly themed games.
Operator-In-The-Loop Deep Sequential Multi-camera Feature Fusion for Person Re-identification
K. L. Navaneet,Santosh Ravi Kiran,SHASHANK SHEKHAR,R. Venkatesh Babu,Anirban Chakraborty
IEEE Transactions on Information Forensics and Security, TIFS, 2018
@article{bib_Oper_2018,
  author        = {K. L. Navaneet and Santosh Ravi Kiran and Shashank Shekhar and R. Venkatesh Babu and Anirban Chakraborty},
  title         = {Operator-In-The-Loop Deep Sequential Multi-camera Feature Fusion for Person Re-identification},
  journal       = {IEEE Transactions on Information Forensics and Security},
  year          = {2018},
  internal-note = {Possible duplicate of bib_Oper_2019 (same title, venue, authors) -- verify and keep one},
}
Given a target image as query, person re-identification systems retrieve a ranked list of candidate matches on a per-camera basis. In deployed systems, a human operator scans these lists and labels sighted targets by touch or mouse-based selection. However, classical re-id approaches generate per-camera lists independently. Therefore, target identifications by operator in a subset of cameras cannot be utilized to improve ranking of the target in remaining set of network cameras. To address this shortcoming, we propose a novel sequential multi-camera re-id approach. The proposed approach can accommodate human operator inputs and provides early gains via a monotonic improvement in target ranking. At the heart of our approach is a fusion function which operates on deep feature representations of query and candidate matches. We formulate an optimization procedure custom-designed to incrementally improve query representation. Since existing evaluation methods cannot be directly adopted to our setting, we also propose two novel evaluation protocols. The results on two large-scale re-id datasets (Market-1501, DukeMTMC-reID) demonstrate that our multi-camera method significantly outperforms baselines and other popular feature fusion schemes. Additionally, we conduct a comparative subject-based study of human operator performance. The superior operator performance enabled by our approach makes a compelling case for its integration into deployable video-surveillance systems.
Deligan: Generative adversarial networks for diverse and limited data
Swaminathan Gurumurthy,Santosh Ravi Kiran,R. Venkatesh Babu
Computer Vision and Pattern Recognition, CVPR, 2017
@inproceedings{bib_Deli_2017,
  author    = {Swaminathan Gurumurthy and Santosh Ravi Kiran and R. Venkatesh Babu},
  title     = {{DeLiGAN}: Generative Adversarial Networks for Diverse and Limited Data},
  booktitle = {Computer Vision and Pattern Recognition},
  year      = {2017},
}
A class of recent approaches for generating images, called Generative Adversarial Networks (GAN), have been used to generate impressively realistic images of objects, bedrooms, handwritten digits and a variety of other image modalities. However, typical GAN-based approaches require large amounts of training data to capture the diversity across the image modality. In this paper, we propose DeLiGAN--a novel GAN-based architecture for diverse and limited training data scenarios. In our approach, we reparameterize the latent generative space as a mixture model and learn the mixture model's parameters along with those of GAN. This seemingly simple modification to the GAN framework is surprisingly effective and results in models which enable diversity in generated samples although trained with limited data. In our work, we show that DeLiGAN can generate images of handwritten digits, objects and hand-drawn sketches, all using limited amounts of data. To quantitatively characterize intra-class diversity of generated samples, we also introduce a modified version of "inception-score", a measure which has been found to correlate well with human assessment of generated samples.
Object category understanding via eye fixations on freehand sketches
Santosh Ravi Kiran,Sudharshan Suresh,R. Venkatesh Babu
IEEE Transactions on Image Processing, TIP, 2017
@article{bib_Obje_2017,
  author  = {Santosh Ravi Kiran and Sudharshan Suresh and R. Venkatesh Babu},
  title   = {Object Category Understanding via Eye Fixations on Freehand Sketches},
  journal = {IEEE Transactions on Image Processing},
  year    = {2017},
}
The study of eye gaze fixations on photographic images is an active research area. In contrast, the image sub-category of freehand sketches has not received as much attention for such studies. In this paper, we analyze the results of a free-viewing gaze fixation study conducted on 3904 freehand sketches distributed across 160 object categories. Our analysis shows that fixation sequences exhibit marked consistency within a sketch, across sketches of a category and even across suitably grouped sets of categories. This multi-level consistency is remarkable given the variability in depiction and extreme image content sparsity that characterizes hand-drawn object sketches. In this paper, we show that the multi-level consistency in the fixation data can be exploited to 1) predict a test sketch's category given only its fixation sequence and 2) build a computational model which predicts part-labels underlying fixations on objects. We hope that our findings motivate the community to deem sketch-like representations worthy of gaze-based studies vis-a-vis photographic images.
SketchParse: Towards rich descriptions for poorly drawn sketches using multi-task hierarchical deep networks
Santosh Ravi Kiran,Isht Dwivedi,Abhijat Biswas,Sahil Manocha,Venkatesh Babu R.
International Conference on Multimedia, IMM, 2017
@inproceedings{bib_Sket_2017,
  author    = {Santosh Ravi Kiran and Isht Dwivedi and Abhijat Biswas and Sahil Manocha and Venkatesh Babu R.},
  title     = {{SketchParse}: Towards Rich Descriptions for Poorly Drawn Sketches Using Multi-task Hierarchical Deep Networks},
  booktitle = {International Conference on Multimedia},
  year      = {2017},
}
The ability to semantically interpret hand-drawn line sketches, although very challenging, can pave way for novel applications in multimedia. We propose SKETCHPARSE, the first deep-network architecture for fully automatic parsing of freehand object sketches. SKETCHPARSE is configured as a two-level fully convolutional network. The first level contains shared layers common to all object categories. The second level contains a number of expert sub-networks. Each expert specializes in parsing sketches from object categories which contain structurally similar parts. Effectively, the two-level configuration enables our architecture to scale up efficiently as additional categories are added. We introduce a router layer which (i) relays sketch features from shared layers to the correct expert (ii) eliminates the need to manually specify object category during inference. To bypass laborious part-level annotation, we sketchify photos from semantic object-part image datasets and use them for training. Our architecture also incorporates object pose prediction as a novel auxiliary task which boosts overall performance while providing supplementary information regarding the sketch. We demonstrate SKETCHPARSE's abilities (i) on two challenging large-scale sketch datasets (ii) in parsing unseen, semantically related object categories (iii) in improving fine-grained sketch-based image retrieval. As a novel application, we also outline how SKETCHPARSE's output can be used to generate caption-style descriptions for hand-drawn sketches.
SwiDeN: convolutional neural networks for depiction invariant object recognition
Santosh Ravi Kiran,Shiv Surya,Srinivas S S Kruthiventi ,Venkatesh Babu R
ACM international conference on Multimedia, ACMMM, 2016
@inproceedings{bib_SwiD_2016,
  author    = {Santosh Ravi Kiran and Shiv Surya and Srinivas S S Kruthiventi and Venkatesh Babu R},
  title     = {{SwiDeN}: Convolutional Neural Networks for Depiction Invariant Object Recognition},
  booktitle = {ACM International Conference on Multimedia},
  year      = {2016},
}
Current state of the art object recognition architectures achieve impressive performance but are typically specialized for a single depictive style (e.g. photos only, sketches only). In this paper, we present SwiDeN: our Convolutional Neural Network (CNN) architecture which recognizes objects regardless of how they are visually depicted (line drawing, realistic shaded drawing, photograph etc.). In SwiDeN, we utilize a novel 'deep' depictive style-based switching mechanism which appropriately addresses the depiction-specific and depiction-invariant aspects of the problem. We compare SwiDeN with alternative architectures and prior work on a 50-category Photo-Art dataset containing objects depicted in multiple styles. Experimental results show that SwiDeN outperforms other approaches for the depiction-invariant object recognition problem.
Enabling my robot to play pictionary: Recurrent neural networks for sketch recognition
Santosh Ravi Kiran,Jogendra Kundu,Venkatesh Babu R
ACM international conference on Multimedia, ACMMM, 2016
@inproceedings{bib_Enab_2016,
  author    = {Santosh Ravi Kiran and Jogendra Kundu and Venkatesh Babu R},
  title     = {Enabling My Robot to Play {Pictionary}: Recurrent Neural Networks for Sketch Recognition},
  booktitle = {ACM International Conference on Multimedia},
  year      = {2016},
}
Freehand sketching is an inherently sequential process. Yet, most approaches for hand-drawn sketch recognition either ignore this sequential aspect or exploit it in an ad-hoc manner. In our work, we propose a recurrent neural network architecture for sketch object recognition which exploits the long-term sequential and structural regularities in stroke data in a scalable manner. Specifically, we introduce a Gated Recurrent Unit based framework which leverages deep sketch features and weighted per-timestep loss to achieve state-of-the-art results on a large database of freehand object sketches across a large number of object categories. The inherently online nature of our framework is especially suited for on-the-fly recognition of objects as they are being drawn. Thus, our framework can enable interesting applications such as camera-equipped robots playing the popular party game Pictionary with human players and generating sparsified yet recognizable sketches of objects.
A taxonomy of deep convolutional neural nets for computer vision
Suraj Srinivas,Santosh Ravi Kiran,Konda Reddy Mopuri,Nikita Prabhu,Srinivas S S Kruthiventi,R. Venkatesh Babu
Frontiers in Robotics and AI, FRAI, 2016
@article{bib_A_ta_2016,
  author  = {Suraj Srinivas and Santosh Ravi Kiran and Konda Reddy Mopuri and Nikita Prabhu and Srinivas S S Kruthiventi and R. Venkatesh Babu},
  title   = {A Taxonomy of Deep Convolutional Neural Nets for Computer Vision},
  journal = {Frontiers in Robotics and AI},
  year    = {2016},
}
Traditional architectures for solving computer vision problems and the degree of success they enjoyed have been heavily reliant on hand-crafted features. However, of late, deep learning techniques have offered a compelling alternative -- that of automatically learning problem-specific features. With this new paradigm, every problem in computer vision is now being re-examined from a deep learning perspective. Therefore, it has become important to understand what kind of deep networks are suitable for a given problem. Although general surveys of this fast-moving paradigm (i.e. deep-networks) exist, a survey specific to computer vision is missing. We specifically consider one form of deep networks widely used in computer vision - convolutional neural networks (CNNs). We start with "AlexNet'' as our base CNN and then examine the broad variations proposed over time to suit different applications. We hope that our recipe-style survey will serve as a guide, particularly for novice practitioners intending to use deep-learning techniques for computer vision.
‘Part’ly first among equals: Semantic part-based benchmarking for state-of-the-art object recognition systems
Santosh Ravi Kiran, Shanthakumar Venkatraman,Venkatesh Babu R
Asian Conference on Computer Vision, ACCV, 2016
@inproceedings{bib_‘P_2016,
  author    = {Santosh Ravi Kiran and Shanthakumar Venkatraman and Venkatesh Babu R},
  title     = {‘Part’ly first among equals: Semantic part-based benchmarking for state-of-the-art object recognition systems},
  booktitle = {Asian Conference on Computer Vision},
  year      = {2016},
}
An examination of object recognition challenge leaderboards (ILSVRC, PASCAL-VOC) reveals that the top-performing classifiers typically exhibit small differences amongst themselves in terms of error rate/mAP. To better differentiate the top performers, additional criteria are required. Moreover, the (test) images, on which the performance scores are based, predominantly contain fully visible objects. Therefore, ‘harder’ test images, mimicking the challenging conditions (e.g. occlusion) in which humans routinely recognize objects, need to be utilized for benchmarking. To address the concerns mentioned above, we make two contributions. First, we systematically vary the level of local object-part content, global detail and spatial context in images from PASCAL VOC 2010 to create a new benchmarking dataset dubbed PPSS-12. Second, we propose an object-part based benchmarking procedure which quantifies classifiers’ robustness to a range of visibility and contextual settings. The benchmarking procedure relies on a semantic similarity measure that naturally addresses potential semantic granularity differences between the category labels in training and test datasets, thus eliminating manual mapping. We use our procedure on the PPSS-12 dataset to benchmark top-performing classifiers trained on the ILSVRC-2012 dataset. Our results show that the proposed benchmarking procedure enables additional differentiation among state-of-the-art object classifiers in terms of their ability to handle missing content and insufficient object detail. Given this capability for additional differentiation, our approach can potentially supplement existing benchmarking procedures used in object recognition challenge leaderboards.
Eye of the dragon: Exploring discriminatively minimalist sketch-based abstractions for object categories
Santosh Ravi Kiran
ACM international conference on Multimedia, ACMMM, 2015
@inproceedings{bib_Eye__2015,
  author    = {Santosh Ravi Kiran},
  title     = {Eye of the dragon: Exploring discriminatively minimalist sketch-based abstractions for object categories},
  booktitle = {ACM international conference on Multimedia},
  year      = {2015},
}
As a form of visual representation, freehand line sketches are typically studied as an end product of the sketching process. However, from a recognition point of view, one can also study various orderings and properties of the primitive strokes that compose the sketch. Studying sketches in this manner has enabled us to create novel sparse yet discriminative sketch-based representations for object categories which we term category-epitomes. Concurrently, the epitome construction provides a natural measure for quantifying the sparseness underlying the original sketch, which we term epitome-score. We analyze category-epitomes and epitome-scores for hand-drawn sketches from a sketch dataset of 160 object categories commonly encountered in daily life. Our analysis provides a novel viewpoint for examining the complexity of representation for visual object categories.
Expresso: A User-Friendly GUI for Designing, Training and Exploring Convolutional Neural Networks
Santosh Ravi Kiran,R. Venkatesh Babu
Technical Report, arXiv, 2015
@misc{bib_Expr_2015,
  author       = {Santosh Ravi Kiran and R. Venkatesh Babu},
  title        = {Expresso: A User-Friendly {GUI} for Designing, Training and Exploring Convolutional Neural Networks},
  howpublished = {arXiv preprint},
  year         = {2015},
}
With a view to provide a user-friendly interface for designing, training and developing deep learning frameworks, we have developed Expresso, a GUI tool written in Python. Expresso is built atop Caffe, the open-source, prize-winning framework popularly used to develop Convolutional Neural Networks. Expresso provides a convenient wizard-like graphical interface which guides the user through various common scenarios--data import, construction and training of deep networks, performing various experiments, analyzing and visualizing the results of these experiments. The multi-threaded nature of Expresso enables concurrent execution and notification of events related to the aforementioned scenarios. The GUI sub-components and inter-component interfaces in Expresso have been designed with extensibility in mind. We believe Expresso's flexibility and ease of use will come in handy to researchers, newcomers and seasoned alike, in their explorations related to deep learning.
SKETCH-EPITOMES: DISCRIMINATIVELY MINIMALIST REPRESENTATIONS FOR OBJECT CATEGORIES
Santosh Ravi Kiran,R. Venkatesh Babu
Technical Report, arXiv, 2015
@misc{bib_SKET_2015,
  author       = {Santosh Ravi Kiran and R. Venkatesh Babu},
  title        = {Sketch-Epitomes: Discriminatively Minimalist Representations for Object Categories},
  howpublished = {arXiv preprint},
  year         = {2015},
}
Freehand line sketches are an interesting and unique form of visual representation. Typically, such sketches are studied and utilized as an end product of the sketching process. However, we have found it instructive to study the sketches as sequentially accumulated composition of drawing strokes added over time. Studying sketches in this manner has enabled us to create novel sparse yet discriminative sketch-based representations for object categories which we term category-epitomes. Our procedure for obtaining these epitomes concurrently provides a natural measure for quantifying the sparseness underlying the original sketch, which we term epitome-score. We construct and analyze category-epitomes and epitome-scores for freehand sketches belonging to various object categories. Our analysis provides a novel viewpoint for studying the semantic nature of object categories.
Freehand Sketch Recognition Using Deep Features
Santosh Ravi Kiran,R. Venkatesh Babu
Technical Report, arXiv, 2015
@misc{bib_Free_2015,
  author       = {Santosh Ravi Kiran and R. Venkatesh Babu},
  title        = {Freehand Sketch Recognition Using Deep Features},
  howpublished = {arXiv preprint},
  year         = {2015},
}
Freehand sketches often contain sparse visual detail. In spite of the sparsity, they are easily and consistently recognized by humans across cultures, languages and age groups. Therefore, analyzing such sparse sketches can aid our understanding of the neuro-cognitive processes involved in visual representation and recognition. In the recent past, Convolutional Neural Networks (CNNs) have emerged as a powerful framework for feature representation and recognition for a variety of image domains. However, the domain of sketch images has not been explored. This paper introduces a freehand sketch recognition framework based on "deep" features extracted from CNNs. We use two popular CNNs for our experiments -- Imagenet CNN and a modified version of LeNet CNN. We evaluate our recognition framework on a publicly available benchmark database containing thousands of freehand sketches depicting everyday objects. Our results are an improvement over the existing state-of-the-art accuracies by 3%-11%. The effectiveness and relative compactness of our deep features also make them an ideal candidate for related problems such as sketch-based image retrieval. In addition, we provide a preliminary glimpse of how such features can help identify crucial attributes (e.g., object-parts) of the sketched objects.
Category-Epitomes: Discriminatively Minimalist Representations for Object Categories
Santosh Ravi Kiran,R. Venkatesh Babu
Technical Report, arXiv, 2015
@misc{bib_Cate_2015,
  author       = {Santosh Ravi Kiran and R. Venkatesh Babu},
  title        = {Category-Epitomes: Discriminatively Minimalist Representations for Object Categories},
  howpublished = {arXiv preprint},
  year         = {2015},
}
Freehand line sketches are an interesting and unique form of visual representation. Typically, such sketches are studied and utilized as an end product of the sketching process. However, we have found it instructive to study the sketches as sequentially accumulated composition of drawing strokes added over time. Studying sketches in this manner has enabled us to create novel sparse yet discriminative sketch-based representations for object categories which we term category-epitomes. Our procedure for obtaining these epitomes concurrently provides a natural measure for quantifying the sparseness underlying the original sketch, which we term epitome-score. We construct and analyze category-epitomes and epitome-scores for freehand sketches belonging to various object categories. Our analysis provides a novel viewpoint for studying the semantic nature of object categories.