a more sensible method to detect text files

This commit is contained in:
Alexandre Pasmantier 2024-10-18 00:48:37 +02:00
parent 49a3948b51
commit d2213af480
8 changed files with 56 additions and 26 deletions

12
Cargo.lock generated
View File

@ -2153,6 +2153,15 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
[[package]]
name = "rustix"
version = "0.38.37"
@ -2419,7 +2428,7 @@ dependencies = [
[[package]]
name = "television"
version = "0.1.5"
version = "0.1.6"
dependencies = [
"anyhow",
"better-panic",
@ -2832,6 +2841,7 @@ dependencies = [
"cargo_metadata",
"derive_builder",
"regex",
"rustc_version",
"rustversion",
"time",
"vergen-lib",

View File

@ -1,6 +1,6 @@
[package]
name = "television"
version = "0.1.5"
version = "0.1.6"
edition = "2021"
description = "The revolution will be televised."
license = "MIT"
@ -72,7 +72,7 @@ pretty_assertions = "1.4.1"
[build-dependencies]
anyhow = "1.0.86"
vergen-gix = { version = "1.0.0", features = ["build", "cargo"] }
vergen-gix = { version = "1.0.0", features = ["build", "cargo", "rustc"] }
[profile.staging]

View File

@ -1,13 +1,15 @@
use anyhow::Result;
use vergen_gix::{BuildBuilder, CargoBuilder, Emitter, GixBuilder};
use vergen_gix::{
BuildBuilder, CargoBuilder, Emitter, GixBuilder, RustcBuilder,
};
fn main() -> Result<()> {
let build = BuildBuilder::all_build()?;
let gix = GixBuilder::all_git()?;
let cargo = CargoBuilder::all_cargo()?;
let build = BuildBuilder::default().build_date(true).build()?;
let cargo = CargoBuilder::default().target_triple(true).build()?;
let rustc = RustcBuilder::default().semver(true).build()?;
Emitter::default()
.add_instructions(&build)?
.add_instructions(&gix)?
.add_instructions(&cargo)?
.add_instructions(&rustc)?
.emit()
}

View File

@ -3,15 +3,17 @@ use nucleo::{
pattern::{CaseMatching, Normalization},
Config, Injector, Nucleo,
};
use std::{path::PathBuf, sync::Arc};
use std::{os::unix::ffi::OsStrExt, path::PathBuf, sync::Arc};
use ignore::DirEntry;
use super::TelevisionChannel;
use crate::entry::Entry;
use crate::fuzzy::MATCHER;
use crate::previewers::PreviewType;
use crate::utils::files::{walk_builder, DEFAULT_NUM_THREADS};
use crate::{
entry::Entry, utils::strings::proportion_of_printable_ascii_characters,
};
use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD};
pub(crate) struct Channel {
matcher: Nucleo<DirEntry>,
@ -19,6 +21,8 @@ pub(crate) struct Channel {
result_count: u32,
total_count: u32,
running: bool,
// TODO: cache result sets so that deleting characters from the pattern is
// smoother. A shallow cache should be enough, possibly a stack of previous
// result sets that we simply pop from.
}
impl Channel {
@ -131,6 +135,13 @@ async fn load_files(path: PathBuf, injector: Injector<DirEntry>) {
if let Ok(entry) = result {
if entry.file_type().unwrap().is_file() {
// Send the path via the async channel
let file_name = entry.file_name();
if proportion_of_printable_ascii_characters(
file_name.as_bytes(),
) < PRINTABLE_ASCII_THRESHOLD
{
return ignore::WalkState::Continue;
}
let _ = injector.push(entry, |e, cols| {
cols[0] = e
.path()

View File

@ -13,13 +13,15 @@ use std::{
use tracing::{debug, info};
use super::TelevisionChannel;
use crate::entry::Entry;
use crate::fuzzy::MATCHER;
use crate::previewers::PreviewType;
use crate::utils::{
files::{is_not_text, is_valid_utf8, walk_builder, DEFAULT_NUM_THREADS},
strings::preprocess_line,
};
use crate::{
entry::Entry, utils::strings::proportion_of_printable_ascii_characters,
};
use crate::{fuzzy::MATCHER, utils::strings::PRINTABLE_ASCII_THRESHOLD};
#[derive(Debug)]
struct CandidateLine {
@ -184,7 +186,8 @@ async fn load_candidates(path: PathBuf, injector: Injector<CandidateLine>) {
if (bytes_read == 0)
|| is_not_text(&buffer)
.unwrap_or(false)
|| !is_valid_utf8(&buffer)
|| proportion_of_printable_ascii_characters(&buffer)
< PRINTABLE_ASCII_THRESHOLD
{
return ignore::WalkState::Continue;
}

View File

@ -21,8 +21,10 @@ pub(crate) struct Cli {
const VERSION_MESSAGE: &str = concat!(
env!("CARGO_PKG_VERSION"),
"-",
env!("VERGEN_GIT_DESCRIBE"),
"\ntarget triple: ",
env!("VERGEN_CARGO_TARGET_TRIPLE"),
"\nbuild: ",
env!("VERGEN_RUSTC_SEMVER"),
" (",
env!("VERGEN_BUILD_DATE"),
")"

View File

@ -17,11 +17,11 @@ use tracing::{debug, warn};
use crate::entry;
use crate::previewers::{Preview, PreviewContent};
use crate::utils::files::is_valid_utf8;
use crate::utils::files::FileType;
use crate::utils::files::{get_file_size, is_known_text_extension};
use crate::utils::strings::{
preprocess_line, proportion_of_printable_ascii_characters,
PRINTABLE_ASCII_THRESHOLD,
};
use super::cache::PreviewCache;
@ -105,7 +105,8 @@ impl FilePreviewer {
FileType::Image => {
debug!("Previewing image file: {:?}", entry.name);
// insert a loading preview into the cache
let preview = loading(&entry.name);
//let preview = loading(&entry.name);
let preview = not_supported(&entry.name);
self.cache_preview(entry.name.clone(), preview.clone())
.await;
//// compute the image preview in the background
@ -199,9 +200,6 @@ impl FilePreviewer {
/// 4 MB
const MAX_FILE_SIZE: u64 = 4 * 1024 * 1024;
/// The proportion of printable ascii characters that a file must have to be considered text.
const PRINTABLE_ASCII_THRESHOLD: f32 = 0.9;
fn get_file_type(&self, path: &Path) -> FileType {
debug!("Getting file type for {:?}", path);
let mut file_type = match infer::get_from_path(path) {
@ -225,12 +223,9 @@ impl FilePreviewer {
} else if let Ok(mut f) = File::open(path) {
let mut buffer = [0u8; 256];
if let Ok(bytes_read) = f.read(&mut buffer) {
// TODO: add a check for the proportion of non printable characters (binary
// files)
if bytes_read > 0
&& is_valid_utf8(&buffer)
&& proportion_of_printable_ascii_characters(&buffer)
> Self::PRINTABLE_ASCII_THRESHOLD
> PRINTABLE_ASCII_THRESHOLD
{
file_type = FileType::Text;
}

View File

@ -54,6 +54,7 @@ lazy_static! {
pub const EMPTY_STRING: &str = "";
pub const FOUR_SPACES: &str = " ";
pub const TAB_WIDTH: usize = 4;
const SPACE_CHARACTER: char = ' ';
const TAB_CHARACTER: char = '\t';
@ -108,6 +109,12 @@ pub(crate) fn replace_nonprintable(input: &[u8], tab_width: usize) -> String {
output
}
/// The threshold for considering a buffer to be printable ASCII.
///
/// This is used to determine whether a file is likely to be a text file
/// based on a sample of its contents: callers compute
/// `proportion_of_printable_ascii_characters` on the sampled bytes and
/// compare the resulting proportion against this value.
pub const PRINTABLE_ASCII_THRESHOLD: f32 = 0.7;
pub(crate) fn proportion_of_printable_ascii_characters(buffer: &[u8]) -> f32 {
let mut printable = 0;
for &byte in buffer {
@ -131,7 +138,7 @@ pub(crate) fn preprocess_line(line: &str) -> String {
}
.trim_end_matches(['\r', '\n', '\0'])
.as_bytes(),
2,
TAB_WIDTH,
)
}