import argparse
import csv
import json
import os
import re
import sys
import xml.etree.ElementTree as ET
import yaml
try:
    import pandas as pd
    PANDAS_AVAILABLE = True
except ImportError:
    PANDAS_AVAILABLE = False

# Version string reported by the CLI's --version flag.
VERSION = "1.0.0"


class FileParser:
    """Main file parser class to handle different file types and operations."""

    def __init__(self):
        """Create a parser that has no file loaded yet."""
        # Where the data came from and which loader interpreted it.
        self.file_path = None
        self.file_type = None
        # Parsed payload; its concrete type depends on ``file_type``.
        self.data = None
        # Initialised here only; no method of this class reads or sets it.
        self.output_format = None
        
    def is_data_empty(self):
        """Tell whether the parser currently holds nothing worth processing.

        ``None`` always counts as empty. A pandas frame is asked via its
        ``empty`` attribute; lists, dicts and strings are empty when their
        length is zero. Any other payload is treated as non-empty.
        """
        payload = self.data
        if payload is None:
            return True

        if PANDAS_AVAILABLE and self.file_type == "pandas":
            return payload.empty

        # Sized containers and text share the same length-based rule.
        if isinstance(payload, (list, dict, str)):
            return len(payload) == 0

        return False

    def load_file(self, file_path):
        """Load ``file_path`` using a reader chosen from its extension.

        CSV/TSV files become a pandas DataFrame when pandas is available and
        a list of row dicts otherwise. JSON and YAML are parsed into Python
        objects, XML into an ``ElementTree``, a fixed set of text extensions
        into a string, and everything else is read as raw bytes.

        Args:
            file_path: Path of the file to read.

        Returns:
            ``True`` when the file was loaded, ``False`` when it does not
            exist or could not be read/parsed (a message is printed in both
            failure cases).
        """
        self.file_path = file_path
        # Forget any earlier load so a failure cannot leave the previous
        # file's data attached to the new path.
        self.data = None
        self.file_type = None

        if not os.path.exists(file_path):
            print(f"Error: File '{file_path}' not found.")
            # Report failure like every other error path instead of killing
            # the interpreter from inside library code.
            return False

        ext = os.path.splitext(file_path)[1].lower()
        text_extensions = ('.txt', '.log', '.md', '.py', '.js', '.html', '.css')

        try:
            if ext in ('.csv', '.tsv'):
                delimiter = ',' if ext == '.csv' else '\t'
                if PANDAS_AVAILABLE:
                    self.data = pd.read_csv(file_path, delimiter=delimiter)
                    self.file_type = "pandas"
                else:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        self.data = list(csv.DictReader(f, delimiter=delimiter))
                    self.file_type = "csv"
            elif ext == '.json':
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.data = json.load(f)
                self.file_type = "json"
            elif ext == '.xml':
                self.data = ET.parse(file_path)
                self.file_type = "xml"
            elif ext in ('.yaml', '.yml'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.data = yaml.safe_load(f)
                self.file_type = "yaml"
            elif ext in text_extensions:
                with open(file_path, 'r', encoding='utf-8') as f:
                    self.data = f.read()
                self.file_type = "text"
            else:
                # Binary or unknown format
                with open(file_path, 'rb') as f:
                    self.data = f.read()
                self.file_type = "binary"
        except Exception as e:
            # Best-effort loader: any read or parse problem is reported and
            # turned into a False return rather than a traceback.
            print(f"Error loading file: {str(e)}")
            return False

        print(f"Successfully loaded {file_path}")
        return True


    def analyze(self, analyze_type='general'):
        """Summarise the loaded data.

        Args:
            analyze_type: One of ``'general'`` (file name, size and type plus
                counts that depend on the file type), ``'structure'`` (a
                depth-limited outline of JSON/YAML/XML data, or column info
                and up to three sample rows for tabular data) or ``'stats'``
                (``describe()`` output for numeric columns of tabular data;
                needs pandas).

        Returns:
            A dict of findings, which stays empty when the requested analysis
            does not apply to the loaded file type, or ``None`` when no data
            is loaded.
        """
        if self.is_data_empty():
            print("No data loaded. Please load a file first.")
            return None

        is_frame = PANDAS_AVAILABLE and self.file_type == "pandas"
        is_tabular = self.file_type in ("csv", "pandas")
        is_document = self.file_type in ("json", "yaml")
        max_depth = 3  # outlines stop descending past this many levels

        def outline_value(value, depth=0):
            """Replace leaves with type names; show only a list's first item."""
            if depth >= max_depth:
                return "..."
            if isinstance(value, dict):
                return {k: outline_value(v, depth + 1) for k, v in value.items()}
            if isinstance(value, list):
                if not value:
                    # An empty list stays a list instead of the string "list".
                    return []
                sample = [outline_value(value[0], depth + 1)]
                if len(value) > 1:
                    sample.append("...")
                return sample
            return type(value).__name__

        def outline_element(element, depth=0):
            """Map child tags to nested outlines or 'text'/'empty' markers."""
            if depth >= max_depth:
                return "..."
            outline = {}
            for child in element:
                if len(child):
                    outline[child.tag] = outline_element(child, depth + 1)
                else:
                    outline[child.tag] = "text" if child.text and child.text.strip() else "empty"
            if element.attrib:
                outline["@attributes"] = {name: "value" for name in element.attrib}
            return outline

        result = {}

        if analyze_type == 'general':
            result["file_name"] = os.path.basename(self.file_path)
            result["file_size"] = os.path.getsize(self.file_path)
            result["file_type"] = self.file_type

            if self.file_type == "text":
                result["line_count"] = len(self.data.splitlines())
                result["word_count"] = len(self.data.split())
                result["char_count"] = len(self.data)
            elif is_tabular:
                result["row_count"] = len(self.data)
                if is_frame:
                    result["column_count"] = len(self.data.columns)
                    result["columns"] = list(self.data.columns)
                elif self.data:
                    result["column_count"] = len(self.data[0])
                    result["columns"] = list(self.data[0].keys())
            elif is_document:
                # YAML parses to the same list/dict shapes as JSON, so both
                # get the same counts.
                if isinstance(self.data, list):
                    result["item_count"] = len(self.data)
                elif isinstance(self.data, dict):
                    result["key_count"] = len(self.data)
                    result["keys"] = list(self.data.keys())
            elif self.file_type == "xml":
                root = self.data.getroot()
                result["root_tag"] = root.tag
                result["child_count"] = len(root)

        elif analyze_type == 'structure':
            if is_document:
                result["structure"] = outline_value(self.data)
            elif is_tabular:
                if is_frame:
                    result["column_types"] = {col: str(dtype) for col, dtype in self.data.dtypes.items()}
                    result["sample_data"] = self.data.head(3).to_dict(orient='records')
                elif self.data:
                    result["column_names"] = list(self.data[0].keys())
                    result["sample_data"] = self.data[:3]
            elif self.file_type == "xml":
                root = self.data.getroot()
                result["structure"] = {root.tag: outline_element(root)}

        elif analyze_type == 'stats' and is_tabular:
            if is_frame:
                numeric_cols = self.data.select_dtypes(include=['number']).columns
                if not numeric_cols.empty:
                    result["statistics"] = self.data[numeric_cols].describe().to_dict()
                else:
                    result["statistics"] = "No numeric columns available for statistics"
            else:
                print("Advanced statistics require pandas library. Install with 'pip install pandas'")
                result["statistics"] = "Requires pandas library"

        return result

    def search(self, pattern, case_sensitive=False):
        """Search the loaded data for a regular expression.

        Text is scanned directly, tabular data row by row (a row matches when
        any of its cells does), and JSON/YAML data through its JSON text form.

        Args:
            pattern: Regular expression source understood by ``re``.
            case_sensitive: Match case exactly when true; otherwise the
                search ignores case.

        Returns:
            ``None`` when no data is loaded. Otherwise a dict with
            ``match_count`` (total hits) and ``matches`` (at most 20 of
            them). An ``error`` key is added when ``pattern`` is not a valid
            regular expression. Other file types report zero matches.
        """
        if self.is_data_empty():
            print("No data loaded. Please load a file first.")
            return None

        result = {"matches": [], "match_count": 0}
        max_results = 20  # cap the reported matches to keep output readable
        context_width = 20  # characters of context kept on each side

        # Compile once: validates the pattern up front and avoids re-parsing
        # it for every cell or row.
        try:
            regex = re.compile(pattern, 0 if case_sensitive else re.IGNORECASE)
        except re.error as e:
            print(f"Error in search: {str(e)}")
            result["error"] = str(e)
            return result

        if self.file_type == "text":
            hits = list(regex.finditer(self.data))
            result["match_count"] = len(hits)
            for hit in hits[:max_results]:
                start = max(0, hit.start() - context_width)
                end = min(len(self.data), hit.end() + context_width)
                result["matches"].append({
                    "match": hit.group(),
                    "position": hit.start(),
                    "context": f"...{self.data[start:end]}..."
                })

        elif self.file_type == "csv" or self.file_type == "pandas":
            if PANDAS_AVAILABLE and self.file_type == "pandas":
                def cell_matches(cell):
                    return bool(regex.search(str(cell)))

                # DataFrame.map supersedes the deprecated applymap. Check the
                # class, not the instance, so a column called "map" cannot
                # shadow the method on older pandas releases.
                if hasattr(type(self.data), "map"):
                    cell_mask = self.data.map(cell_matches)
                else:
                    cell_mask = self.data.applymap(cell_matches)
                matching_rows = self.data[cell_mask.any(axis=1)]

                result["match_count"] = len(matching_rows)
                if not matching_rows.empty:
                    result["matches"] = matching_rows.head(max_results).to_dict(orient='records')
            else:
                matching_rows = [
                    row for row in self.data
                    if any(regex.search(str(value)) for value in row.values())
                ]
                result["match_count"] = len(matching_rows)
                result["matches"] = matching_rows[:max_results]

        elif self.file_type == "json" or self.file_type == "yaml":
            # Search the serialised text. ensure_ascii=False keeps non-ASCII
            # characters literal so patterns containing them can match, and
            # default=str lets values JSON cannot encode (e.g. YAML dates) be
            # searched by their string form instead of raising.
            data_str = json.dumps(self.data, ensure_ascii=False, default=str)
            hits = list(regex.finditer(data_str))
            result["match_count"] = len(hits)
            result["matches"] = [hit.group() for hit in hits[:max_results]]

        return result

    def extract(self, query):
        """Pull a subset of the loaded data out with a type-specific query.

        Query forms by file type:

        * JSON/YAML: dot-separated path; a segment may end in one or more
          list indices, e.g. ``items[0].tags[1]`` or ``grid[2][0]``.
        * Tabular: ``select col1, col2``, ``column:NAME`` or a bare column
          name. ``where <expr>`` is handed to ``DataFrame.query`` and
          therefore needs pandas.
        * XML: a path accepted by ``Element.findall`` on the root element.
        * Text: a regular expression passed to ``re.findall`` with
          ``re.MULTILINE``.

        Args:
            query: Query string in the form matching the loaded file type.

        Returns:
            ``None`` when no data is loaded; otherwise a dict whose
            ``extracted_data`` holds the result (``None`` if nothing could be
            extracted). On failure the dict also carries an ``error`` message.
        """
        if self.is_data_empty():
            print("No data loaded. Please load a file first.")
            return None

        result = {"extracted_data": None}

        try:
            if self.file_type == "json" or self.file_type == "yaml":
                node = self.data
                for segment in query.split('.'):
                    key, bracket, rest = segment.partition('[')
                    if bracket and segment.endswith(']'):
                        if key:
                            node = node[key]
                        # "0][1]" -> "0][1" -> ["0", "1"]: apply each index
                        # in turn so chained subscripts work.
                        for index in rest[:-1].split(']['):
                            node = node[int(index)]
                    else:
                        node = node[segment]
                result["extracted_data"] = node

            elif self.file_type == "csv" or self.file_type == "pandas":
                lowered = query.lower()
                if PANDAS_AVAILABLE and self.file_type == "pandas":
                    if lowered.startswith("where "):
                        # NOTE: DataFrame.query evaluates the expression it is
                        # given; only pass queries from a trusted source.
                        df_result = self.data.query(query[6:], engine='python')
                        result["extracted_data"] = df_result.to_dict(orient='records')
                    elif lowered.startswith("select "):
                        cols = [c.strip() for c in query[7:].split(',')]
                        result["extracted_data"] = self.data[cols].to_dict(orient='records')
                    elif query.startswith("column:"):
                        # Same prefix the non-pandas path understands.
                        result["extracted_data"] = self.data[query[7:]].to_list()
                    else:
                        # Assume it's a column name
                        result["extracted_data"] = self.data[query].to_list()
                else:
                    rows = self.data
                    known_columns = rows[0]
                    if query.startswith("column:"):
                        column = query[7:]
                        result["extracted_data"] = [row.get(column) for row in rows if column in row]
                    elif lowered.startswith("select "):
                        cols = [c.strip() for c in query[7:].split(',')]
                        unknown = [c for c in cols if c not in known_columns]
                        if unknown:
                            raise KeyError(f"Unknown column(s): {', '.join(unknown)}")
                        result["extracted_data"] = [{c: row.get(c) for c in cols} for row in rows]
                    elif query in known_columns:
                        # Bare column name, mirroring the pandas path.
                        result["extracted_data"] = [row.get(query) for row in rows]
                    else:
                        print("Complex queries require pandas library. Install with 'pip install pandas'")
                        result["extracted_data"] = None

            elif self.file_type == "xml":
                root = self.data.getroot()
                extracted = []
                for elem in root.findall(query):
                    if elem.text and elem.text.strip():
                        extracted.append(elem.text)
                    else:
                        # No text content: describe the element instead.
                        elem_data = {"tag": elem.tag}
                        if elem.attrib:
                            # Copy so callers cannot mutate the parsed tree.
                            elem_data["attributes"] = dict(elem.attrib)
                        extracted.append(elem_data)
                result["extracted_data"] = extracted

            elif self.file_type == "text":
                result["extracted_data"] = re.findall(query, self.data, re.MULTILINE)

        except Exception as e:
            # Queries are free-form user input, so any lookup or parse
            # failure is reported in the result instead of propagating.
            print(f"Error in extraction: {str(e)}")
            result["error"] = str(e)

        return result

    def transform(self, transform_type, output_file=None):
        """Write the loaded data to a file in another format.

        Supported conversions: tabular/YAML/XML -> ``json``; JSON/YAML/XML ->
        ``csv``; JSON/tabular -> ``yaml``; anything -> ``text``.

        Args:
            transform_type: Target format: ``"json"``, ``"csv"``, ``"yaml"``
                or ``"text"``.
            output_file: Destination path. Defaults to the source path with
                its extension replaced by ``transform_type``.

        Returns:
            ``True`` when the output file was written, ``False`` when no data
            is loaded, the conversion is unsupported, or writing failed (a
            message is printed in each case).
        """
        if self.is_data_empty():
            print("No data loaded. Please load a file first.")
            return False

        output_path = output_file or f"{os.path.splitext(self.file_path)[0]}.{transform_type}"
        is_frame = PANDAS_AVAILABLE and self.file_type == "pandas"
        is_tabular = self.file_type in ("csv", "pandas")
        is_document = self.file_type in ("json", "yaml")

        def tabular_records():
            """Return tabular data as a list of row dicts."""
            return self.data.to_dict(orient='records') if is_frame else self.data

        def ordered_fieldnames(records):
            """Collect every key across records in first-seen order."""
            return list(dict.fromkeys(key for record in records for key in record))

        def xml_to_dict(element):
            """Nest child tags into dicts; leaves become their text."""
            converted = {}
            for child in element:
                if len(child):
                    converted[child.tag] = xml_to_dict(child)
                else:
                    converted[child.tag] = child.text or ""
            if element.attrib:
                converted["@attributes"] = element.attrib
            return converted

        try:
            if transform_type == "json":
                if is_tabular:
                    payload = tabular_records()
                elif self.file_type == "yaml":
                    payload = self.data
                elif self.file_type == "xml":
                    root = self.data.getroot()
                    payload = {root.tag: xml_to_dict(root)}
                else:
                    print(f"Cannot convert {self.file_type} to JSON")
                    return False
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(payload, f, indent=2)

            elif transform_type == "csv":
                if is_document:
                    if PANDAS_AVAILABLE:
                        pd.DataFrame(self.data).to_csv(output_path, index=False)
                    elif isinstance(self.data, list):
                        records = [item for item in self.data if isinstance(item, dict)]
                        # First-seen key order keeps the header stable from
                        # run to run (a set would not).
                        with open(output_path, 'w', newline='', encoding='utf-8') as f:
                            writer = csv.DictWriter(f, fieldnames=ordered_fieldnames(records))
                            writer.writeheader()
                            writer.writerows(records)
                    else:
                        print("JSON/YAML must contain a list of objects to convert to CSV")
                        return False
                elif self.file_type == "xml":
                    # Each child of the root is one row; its children are cells.
                    rows = [
                        {elem.tag: elem.text or "" for elem in child}
                        for child in self.data.getroot()
                    ]
                    if not rows:
                        print("No suitable data structure found in XML for CSV conversion")
                        return False
                    # Use the union of tags: rows may carry tags the first
                    # row lacks, which DictWriter would otherwise reject.
                    with open(output_path, 'w', newline='', encoding='utf-8') as f:
                        writer = csv.DictWriter(f, fieldnames=ordered_fieldnames(rows))
                        writer.writeheader()
                        writer.writerows(rows)
                else:
                    print(f"Cannot convert {self.file_type} to CSV")
                    return False

            elif transform_type == "yaml":
                if self.file_type == "json":
                    payload = self.data
                elif is_tabular:
                    payload = tabular_records()
                else:
                    print(f"Cannot convert {self.file_type} to YAML")
                    return False
                with open(output_path, 'w', encoding='utf-8') as f:
                    yaml.dump(payload, f, default_flow_style=False)

            elif transform_type == "text":
                with open(output_path, 'w', encoding='utf-8') as f:
                    if is_document:
                        json.dump(self.data, f, indent=2)
                    elif is_tabular:
                        if is_frame:
                            f.write(self.data.to_string())
                        else:
                            writer = csv.writer(f)
                            if self.data:
                                writer.writerow(self.data[0].keys())
                                for row in self.data:
                                    writer.writerow(row.values())
                    elif self.file_type == "xml":
                        f.write(ET.tostring(self.data.getroot(), encoding='unicode'))
                    else:
                        f.write(str(self.data))
            else:
                print(f"Unsupported output format: {transform_type}")
                return False

        except Exception as e:
            # Any I/O or serialisation failure is reported, not raised.
            print(f"Error in transformation: {str(e)}")
            return False

        print(f"Successfully transformed to {output_path}")
        return True

    def output_results(self, data, format_type="terminal"):
        """Print ``data`` to standard output in the requested format.

        Args:
            data: Result to print.
            format_type: ``"terminal"`` (dicts as indented JSON, anything
                else via ``print``), ``"json"`` (indented JSON) or ``"csv"``
                (a non-empty list of dicts as a table, or a dict as
                ``key,value`` lines). Other values print nothing.
        """
        def to_json(value):
            # default=str keeps display working for values JSON cannot encode
            # natively (e.g. dates parsed from YAML) instead of raising.
            return json.dumps(value, indent=2, default=str)

        if format_type == "terminal":
            print(to_json(data) if isinstance(data, dict) else data)
        elif format_type == "json":
            print(to_json(data))
        elif format_type == "csv":
            is_record_list = (
                isinstance(data, list)
                and bool(data)  # an empty list has no header to derive
                and all(isinstance(item, dict) for item in data)
            )
            if is_record_list:
                # Union of keys in first-seen order, so rows with extra keys
                # are written instead of rejected by DictWriter.
                fieldnames = list(dict.fromkeys(key for item in data for key in item))
                writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(data)
            elif isinstance(data, dict):
                # csv.writer quotes values containing commas, quotes or
                # newlines; str() keeps None printed as "None" as before.
                writer = csv.writer(sys.stdout, lineterminator="\n")
                for key, value in data.items():
                    writer.writerow([str(key), str(value)])
            else:
                print("Data format not suitable for CSV output")


def main(argv=None):
    """Run the command-line interface.

    Loads the given file and performs the first requested operation, in the
    order analyze, search, extract, transform. With no operation flag it
    analyzes the file using ``--analyze-type`` (``general`` by default).

    Args:
        argv: Argument list to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Exits with status 1 when no file is given, the file cannot be loaded, the
    operation finds no data to work on, or a transformation fails.
    """
    parser = argparse.ArgumentParser(description="File Parser CLI Tool")

    # Main arguments
    parser.add_argument("file", nargs="?", help="Path to the file to parse")
    # argparse's built-in action prints the version and exits with status 0.
    parser.add_argument("--version", action="version",
                        version=f"File Parser CLI Tool v{VERSION}",
                        help="Show version information")

    # Operations
    operation_group = parser.add_argument_group("Operations")
    operation_group.add_argument("--analyze", action="store_true", help="Analyze file content")
    operation_group.add_argument("--analyze-type", choices=["general", "structure", "stats"],
                                 default="general", help="Type of analysis to perform")
    operation_group.add_argument("--search", metavar="PATTERN", help="Search for a pattern in the file")
    operation_group.add_argument("--extract", metavar="QUERY",
                                 help="Extract specific data from the file using a query")
    operation_group.add_argument("--transform", metavar="FORMAT",
                                 choices=["json", "csv", "yaml", "text"],
                                 help="Transform file to a different format")

    # Options
    options_group = parser.add_argument_group("Options")
    options_group.add_argument("--output", metavar="FILE", help="Output file for transformed data")
    options_group.add_argument("--format", choices=["terminal", "json", "csv"],
                               default="terminal", help="Output format for results")
    options_group.add_argument("--case-sensitive", action="store_true",
                               help="Make search case-sensitive")

    args = parser.parse_args(argv)

    if not args.file:
        parser.print_help()
        sys.exit(1)

    file_parser = FileParser()
    if not file_parser.load_file(args.file):
        sys.exit(1)

    if args.transform and not (args.analyze or args.search or args.extract):
        # Transform writes a file instead of printing a result; surface a
        # failed conversion through the exit status.
        if not file_parser.transform(args.transform, args.output):
            sys.exit(1)
        return

    if args.analyze:
        result = file_parser.analyze(args.analyze_type)
    elif args.search:
        result = file_parser.search(args.search, args.case_sensitive)
    elif args.extract:
        result = file_parser.extract(args.extract)
    else:
        # No operation requested: analyze, honouring --analyze-type (which
        # defaults to "general").
        result = file_parser.analyze(args.analyze_type)

    if result is None:
        # The operation already reported that there is no data to work on.
        sys.exit(1)
    file_parser.output_results(result, args.format)


if __name__ == "__main__":
    # Run the CLI only when this file is executed as a script, not on import.
    main()
    
    
    
# Example invocations:
# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv

# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv --analyze --analyze-type structure

# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv --search "besan"

# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv --extract "select customer_id"

# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv --extract "where customer_id > 100"

# python3 file_parser_cli_tool.py /home/iauro/sales_data.csv --transform json --output sales_data.json

