package pso

import (
	"sort"
	"strings"

	pdfreader "github.com/ledongthuc/pdf"
)

// pdfRowGapThreshold is the largest horizontal gap at which collapsePDFRowCells
// still merges a text fragment into the current cell. The gap is the fragment's
// X minus the right edge (X+W) reached so far in that cell; a strictly larger
// gap starts a new cell. It is expressed in the same units as the X and W
// fields of pdfreader.Text.
const pdfRowGapThreshold = 14.0

// loadPDFRows opens the PDF at filePath and returns its text as a list of rows,
// each row being a list of normalized cell strings.
//
// Pages are visited from 1 to NumPage. A page is skipped when its value is
// null, when it has no "Contents" entry, or when GetTextByRow reports an error
// for it (best-effort extraction: one bad page does not fail the whole file).
// Within a page, rows are emitted in the reverse of the order GetTextByRow
// returns them. Rows that collapse to zero cells are dropped.
//
// An error is returned only when the file cannot be opened.
func loadPDFRows(filePath string) ([][]string, error) {
	handle, doc, err := pdfreader.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer handle.Close()

	result := [][]string{}
	for pageNum, pageCount := 1, doc.NumPage(); pageNum <= pageCount; pageNum++ {
		pg := doc.Page(pageNum)
		if pg.V.IsNull() {
			continue
		}

		// Pages without a content stream have nothing to extract.
		if body := pg.V.Key("Contents"); body.IsNull() || body.Kind() == pdfreader.Null {
			continue
		}

		extracted, rowErr := pg.GetTextByRow()
		if rowErr != nil {
			continue // Skip problematic pages
		}

		// Walk the extracted rows back to front.
		for offset := range extracted {
			line := extracted[len(extracted)-1-offset]
			rawCells := collapsePDFRowCells(line.Content)
			if len(rawCells) == 0 {
				continue
			}

			cleaned := make([]string, 0, len(rawCells))
			for _, raw := range rawCells {
				cleaned = append(cleaned, normalizePDFCellText(raw))
			}
			result = append(result, cleaned)
		}
	}

	return result, nil
}

// normalizePDFCellText cleans up the text of a single extracted PDF cell.
//
// The value is split on Unicode whitespace. If every resulting token is exactly
// one character (letter-spaced text such as "T O T A L"), the tokens are glued
// together with no separator ("TOTAL"). Otherwise the tokens are joined with a
// single space, which trims the value and collapses internal whitespace runs.
// An empty or whitespace-only value yields "".
func normalizePDFCellText(value string) string {
	// Fields ignores leading/trailing whitespace, so no separate trim is needed.
	tokens := strings.Fields(value)
	if len(tokens) == 0 {
		return ""
	}

	for _, token := range tokens {
		// Count runes, not bytes: a single non-ASCII character such as "é"
		// occupies several bytes but is still one letter-spaced character.
		if len([]rune(token)) != 1 {
			return strings.Join(tokens, " ")
		}
	}

	// Join the tokens directly instead of stripping ' ' from the input, so that
	// tabs, newlines and other whitespace separators are removed as well.
	return strings.Join(tokens, "")
}

// collapsePDFRowCells merges the text fragments of one PDF row into cell strings.
//
// Fragments whose text is blank are dropped. The rest are stably ordered by X
// (ascending; equal X falls back to Y descending) and swept left to right: a
// fragment joins the current cell, separated by a single space, unless its X
// lies more than pdfRowGapThreshold beyond the right edge (X+W) reached so far
// in that cell, in which case a new cell begins. If more than 11 cells result,
// the 11th and all later ones are joined with spaces into a single final cell.
//
// It returns nil when no fragment carries any text.
func collapsePDFRowCells(texts pdfreader.TextHorizontal) []string {
	const maxCells = 11

	// Keep only the fragments that contain visible text.
	fragments := make([]pdfreader.Text, 0, len(texts))
	for _, t := range texts {
		if strings.TrimSpace(t.S) == "" {
			continue
		}
		fragments = append(fragments, t)
	}
	if len(fragments) == 0 {
		return nil
	}

	sort.SliceStable(fragments, func(a, b int) bool {
		left, right := fragments[a], fragments[b]
		if left.X == right.X {
			return left.Y > right.Y
		}
		return left.X < right.X
	})

	cells := make([]string, 0, maxCells)
	var words []string     // trimmed fragments of the cell being built
	var rightEdge float64 // furthest X+W seen within the cell being built

	for _, frag := range fragments {
		word := strings.TrimSpace(frag.S)
		fragEnd := frag.X + frag.W

		startsNewCell := len(words) == 0 || frag.X-rightEdge > pdfRowGapThreshold
		if startsNewCell {
			if len(words) > 0 {
				cells = append(cells, strings.Join(words, " "))
			}
			words = append(words[:0], word)
			rightEdge = fragEnd
			continue
		}

		words = append(words, word)
		// Overlapping fragments may end before the current edge; never shrink it.
		if fragEnd > rightEdge {
			rightEdge = fragEnd
		}
	}
	// fragments is non-empty here, so there is always a pending cell to emit.
	cells = append(cells, strings.Join(words, " "))

	// Fold any overflow into the last permitted cell.
	if len(cells) > maxCells {
		overflow := strings.Join(cells[maxCells-1:], " ")
		cells = append(cells[:maxCells-1], overflow)
	}

	return cells
}
